[libc-commits] [libc] [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (PR #101917)
via libc-commits
libc-commits at lists.llvm.org
Thu Aug 8 11:09:44 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/101917
>From 557117d2af74fe156a1d0e7e576e432634e792e1 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 21:52:21 -0400
Subject: [PATCH 1/5] add generic and nvptx sin throughput benchmark
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 42 +++++++------
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 62 ++++++++-----------
.../gpu/timing/amdgpu/CMakeLists.txt | 4 ++
libc/benchmarks/gpu/timing/amdgpu/timing.h | 51 ++++++++++-----
.../gpu/timing/nvptx/CMakeLists.txt | 4 ++
libc/benchmarks/gpu/timing/nvptx/timing.h | 32 ++++++++++
6 files changed, 123 insertions(+), 72 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2b85b146ed7459..39e4a6e9e0152a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
- uint32_t min_iterations = 50;
+ uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
};
// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+// -EXP_BIAS corresponding to denormal values,
+// EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+ int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+ double scale =
+ static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);
- static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
- StorageType ending_bit, StorageType step) {
- uint64_t total_time = 0;
- if (step <= 0)
- step = 1;
- volatile T result;
- for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
- T x = FPBits(bits).get_val();
- total_time += LIBC_NAMESPACE::latency(f, x);
- }
- StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
- return total_time / num_runs;
+ template <size_t N = 1>
+ static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+ cpp::array<T, N> inputs;
+ for (size_t i = 0; i < N; ++i)
+ inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+ uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+
+ return total_time / N;
}
};
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
-
#endif
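
For reference, the sampling that the new get_rand_input() aims for can be written as a small standalone sketch (illustrative only; it uses <random> and ldexp in place of the libc FPBits and rand() helpers, and the function name here is hypothetical):

#include <cmath>
#include <random>

// Draw a double whose unbiased exponent lies in [min_exp, max_exp], i.e.
// 2^min_exp <= |x| < 2^(max_exp + 1) -- the range documented above.
static double rand_in_exponent_range(int min_exp, int max_exp) {
  static std::mt19937_64 rng{12345};
  std::uniform_real_distribution<double> mantissa(1.0, 2.0); // [1.0, 2.0)
  std::uniform_int_distribution<int> exponent(min_exp, max_exp);
  return std::ldexp(mantissa(rng), exponent(rng)); // mantissa * 2^exponent
}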
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 5849ea3e99bb09..17b31402b3bd75 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
- return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Defining each benchmark using a macro
// that expands to a lambda lets us swap the implementation of `sin()` and
// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func) \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
- return LIBC_NAMESPACE::latency(Func, x); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
+ Func, MIN_EXP, MAX_EXP); \
}
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-#define BM_TWO_PI(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
-
-#define BM_LARGE_INT(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
-#endif
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
+#endif
\ No newline at end of file
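
For clarity, one invocation such as BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3) expands (roughly; expansion written by hand, not compiler output) into four single-wave benchmarks that differ only in the batch size N:

SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi_1,
                      []() {
                        return LIBC_NAMESPACE::benchmarks::MathPerf<
                            double>::run_perf_in_range<1>(LIBC_NAMESPACE::sin,
                                                          -10, 3);
                      });
// ...and likewise SinTwoPi_128, SinTwoPi_1024, and SinTwoPi_4096 with
// N = 128, 1024, and 4096 random inputs per timed region.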
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index 179429db9a09ae..aa5dcd33bee9c8 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e308d619e95695..e53eb25f83930f 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -14,17 +14,10 @@
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
+#include "src/__support/CPP/array.h"
#include <stdint.h>
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
- if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
- asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
- else \
- asm("" ::"v"(VARIABLE))
-
namespace LIBC_NAMESPACE_DECL {
// Returns the overhead associated with calling the profiling region. This
@@ -50,7 +43,8 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- FORCE_TO_REGISTER(T, arg);
+ // VGPR constraints can only accept primitive values.
+ asm("" ::"v"(&arg));
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
@@ -59,8 +53,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- FORCE_TO_REGISTER(T, arg);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg));
// Run the function under test and return its value.
auto result = f(arg);
@@ -87,15 +80,12 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
+ asm("" ::"v"(&arg1), "v"(&arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
auto result = f(arg1, arg2);
@@ -109,6 +99,35 @@ template <typename F, typename T1, typename T2>
return stop - start;
}
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"v"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start), "v"(array_pointer));
+
+ for (auto input : register_array) {
+ auto result = f(input);
+
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
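
The structure of the new throughput helper is fence, read the clock, run the whole batch while keeping every result observably live, then read the clock again. A host-side analogue of the same shape (illustrative sketch only; std::chrono and a volatile sink stand in for gpu::processor_clock() and the v_or_b32 / or.b32 no-ops):

#include <array>
#include <chrono>
#include <cstdint>

template <typename F, typename T, size_t N>
[[gnu::noinline]] uint64_t batch_time_host(F f, const std::array<T, N> &inputs) {
  auto start = std::chrono::steady_clock::now();
  for (auto input : inputs) {
    volatile auto sink = f(input); // force each result to be materialized,
    (void)sink;                    // as the GPU no-op asm sinks do above
  }
  auto stop = std::chrono::steady_clock::now();
  // Elapsed nanoseconds for the whole batch; divide by N for a per-call figure.
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start)
          .count());
}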
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 9958e16206a410..2723c8940814c6 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index b426dfd0ea1535..dee8d6ea41f474 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
@@ -94,6 +96,36 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"r"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"r"(array_pointer), "llr"(start));
+
+ uint64_t result;
+ for (auto input : register_array) {
+ asm("" ::"r"(input));
+ result = f(input);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ gpu::memory_fence();
+ asm("" ::"r"(stop));
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
>From bffe71a26f23a474eba48c5b19c8794a7705a293 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 21:58:48 -0400
Subject: [PATCH 2/5] add trailing new line
---
libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 17b31402b3bd75..03f824deae6a5a 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -52,4 +52,4 @@ BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
-#endif
\ No newline at end of file
+#endif
>From 29635687ad4d4536a2bfa4e7b12319c8fb9192af Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 22:10:23 -0400
Subject: [PATCH 3/5] run clang-format
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e53eb25f83930f..30b34de4fc87b0 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,12 +9,12 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
-#include "src/__support/CPP/array.h"
#include <stdint.h>
>From df3f0face9bee30141c82ed152e21efc5f29a2c7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 8 Aug 2024 14:04:17 -0400
Subject: [PATCH 4/5] remove arg inline asm capture and capture array pointer
for throughput on nvptx
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 ++--
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 4 ++--
libc/benchmarks/gpu/timing/nvptx/timing.h | 21 +++++++------------
3 files changed, 12 insertions(+), 17 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 39e4a6e9e0152a..830e6f9e89a743 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -149,12 +149,12 @@ template <typename T> class MathPerf {
typedef T Func(T);
template <size_t N = 1>
- static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+ static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
cpp::array<T, N> inputs;
for (size_t i = 0; i < N; ++i)
inputs[i] = get_rand_input<T>(min_exp, max_exp);
- uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+ uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
return total_time / N;
}
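
As a usage sketch (hypothetical call site, not part of the patch), the renamed helper returns the average clock-cycle count per call over a batch of N random inputs:

// Time 128 sin() calls on inputs whose exponents fall in [-10, 3] and get
// back the average number of cycles per call.
uint64_t avg_cycles =
    LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_throughput_in_range<128>(
        LIBC_NAMESPACE::sin, -10, 3);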
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 03f824deae6a5a..e86961790b9438 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -21,8 +21,8 @@
// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
- Func, MIN_EXP, MAX_EXP); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf< \
+ double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP); \
}
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index dee8d6ea41f474..637986abd9092d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
volatile uint32_t x = 1;
uint32_t y = x;
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(y), "llr"(start));
+ asm("" ::"llr"(start));
uint32_t result = y;
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
@@ -44,7 +44,6 @@ template <typename F, typename T>
// not constant propagate it and remove the profiling region.
volatile T storage = t;
T arg = storage;
- asm("" ::"r"(arg));
// Get the current timestamp from the clock.
gpu::memory_fence();
@@ -52,7 +51,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock cycle
// counter before the profiling region.
- asm("" ::"r"(arg), "llr"(start));
+ asm("" ::"llr"(start));
// Run the function under test and return its value.
auto result = f(arg);
@@ -78,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T2 storage2 = t2;
T1 arg = storage;
T2 arg2 = storage2;
- asm("" ::"r"(arg), "r"(arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(arg), "r"(arg2), "llr"(start));
+ asm("" ::"llr"(start));
auto result = f(arg, arg2);
@@ -100,22 +98,19 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
// Provides throughput benchmarking.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
-latency(F f, const cpp::array<T, N> &inputs) {
- volatile auto storage = &inputs;
- auto array_pointer = storage;
- asm("" ::"r"(array_pointer));
- auto register_array = *array_pointer;
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"r"(&inputs));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(array_pointer), "llr"(start));
+ asm("" ::"llr"(start));
uint64_t result;
- for (auto input : register_array) {
+ for (auto input : inputs) {
asm("" ::"r"(input));
result = f(input);
- asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+ asm("" ::"r"(result));
}
uint64_t stop = gpu::processor_clock();
>From 47a0f43c93d559596a46b600e9387305b218c85e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 8 Aug 2024 14:08:38 -0400
Subject: [PATCH 5/5] fix asm constraints for amdgpu
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 42 +++++++++++-----------
1 file changed, 22 insertions(+), 20 deletions(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 30b34de4fc87b0..d5c3df27b7de60 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -43,9 +43,6 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- // VGPR constraints can only accept primitive values.
- asm("" ::"v"(&arg));
-
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
// Get the current timestamp from the clock.
@@ -53,15 +50,22 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- asm("" ::"s"(start), "v"(&arg));
+ asm("" ::"s"(start));
// Run the function under test and return its value.
auto result = f(arg);
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ // AMDGPU does not support input register constraints for i1 and i8, so we
+ // cast it to a 32-bit integer. This does not add an additional assembly
+ // instruction (https://godbolt.org/z/zxGqv8G91).
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -80,17 +84,19 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- asm("" ::"v"(&arg1), "v"(&arg2));
-
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
+ asm("" ::"s"(start));
auto result = f(arg1, arg2);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
@@ -102,22 +108,18 @@ template <typename F, typename T1, typename T2>
// Provides throughput benchmarking.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
-latency(F f, const cpp::array<T, N> &inputs) {
- volatile auto storage = &inputs;
- auto array_pointer = storage;
- asm("" ::"v"(array_pointer));
- auto register_array = *array_pointer;
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"v"(&inputs));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"s"(start), "v"(array_pointer));
+ asm("" ::"s"(start));
- for (auto input : register_array) {
+ for (auto input : inputs) {
auto result = f(input);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ asm("" ::"v"(result));
}
uint64_t stop = gpu::processor_clock();
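
The i1/i8 workaround duplicated across the two latency() overloads above can be read as a single helper (a minimal sketch; the name sink_result is hypothetical): AMDGPU "v" (VGPR) input constraints reject bool and char operands, so those results are widened to uint32_t before being handed to the no-op v_or_b32, which adds no extra instruction.

template <typename R> static LIBC_INLINE void sink_result(R result) {
  if constexpr (cpp::is_same_v<R, char> || cpp::is_same_v<R, bool>)
    // Widen i1/i8 results so the "v" constraint is legal.
    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
        static_cast<uint32_t>(result)));
  else
    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
}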