[libc-commits] [libc] [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (PR #101917)
via libc-commits
libc-commits at lists.llvm.org
Thu Aug 8 11:09:44 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/101917
>From 557117d2af74fe156a1d0e7e576e432634e792e1 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 21:52:21 -0400
Subject: [PATCH 1/5] add generic and nvptx sin throughput benchmark
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 42 +++++++------
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 62 ++++++++-----------
.../gpu/timing/amdgpu/CMakeLists.txt | 4 ++
libc/benchmarks/gpu/timing/amdgpu/timing.h | 51 ++++++++++-----
.../gpu/timing/nvptx/CMakeLists.txt | 4 ++
libc/benchmarks/gpu/timing/nvptx/timing.h | 32 ++++++++++
6 files changed, 123 insertions(+), 72 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2b85b146ed7459..39e4a6e9e0152a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
- uint32_t min_iterations = 50;
+ uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
};
// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+// -EXP_BIAS corresponding to denormal values,
+// EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+ int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+ double scale =
+ static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);
- static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
- StorageType ending_bit, StorageType step) {
- uint64_t total_time = 0;
- if (step <= 0)
- step = 1;
- volatile T result;
- for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
- T x = FPBits(bits).get_val();
- total_time += LIBC_NAMESPACE::latency(f, x);
- }
- StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
- return total_time / num_runs;
+ template <size_t N = 1>
+ static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+ cpp::array<T, N> inputs;
+ for (size_t i = 0; i < N; ++i)
+ inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+ uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+
+ return total_time / N;
}
};
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
-
#endif
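
For reference, the sampling that the new get_rand_input() aims for can be written as a small standalone sketch (illustrative only; it uses <random> and ldexp in place of the libc FPBits and rand() helpers, and the function name here is hypothetical):

#include <cmath>
#include <random>

// Draw a double whose unbiased exponent lies in [min_exp, max_exp], i.e.
// 2^min_exp <= |x| < 2^(max_exp + 1) -- the range documented above.
static double rand_in_exponent_range(int min_exp, int max_exp) {
  static std::mt19937_64 rng{12345};
  std::uniform_real_distribution<double> mantissa(1.0, 2.0); // [1.0, 2.0)
  std::uniform_int_distribution<int> exponent(min_exp, max_exp);
  return std::ldexp(mantissa(rng), exponent(rng)); // mantissa * 2^exponent
}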
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 5849ea3e99bb09..17b31402b3bd75 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
- return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Defining each benchmark using a macro
// that expands to a lambda lets us swap the implementation of `sin()` and
// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func) \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
- return LIBC_NAMESPACE::latency(Func, x); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
+ Func, MIN_EXP, MAX_EXP); \
}
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-#define BM_TWO_PI(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
-
-#define BM_LARGE_INT(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
-#endif
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
+#endif
\ No newline at end of file
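
For clarity, one invocation such as BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3) expands (roughly; expansion written by hand, not compiler output) into four single-wave benchmarks that differ only in the batch size N:

SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi_1,
                      []() {
                        return LIBC_NAMESPACE::benchmarks::MathPerf<
                            double>::run_perf_in_range<1>(LIBC_NAMESPACE::sin,
                                                          -10, 3);
                      });
// ...and likewise SinTwoPi_128, SinTwoPi_1024, and SinTwoPi_4096 with
// N = 128, 1024, and 4096 random inputs per timed region.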
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index 179429db9a09ae..aa5dcd33bee9c8 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e308d619e95695..e53eb25f83930f 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -14,17 +14,10 @@
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
+#include "src/__support/CPP/array.h"
#include <stdint.h>
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
- if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
- asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
- else \
- asm("" ::"v"(VARIABLE))
-
namespace LIBC_NAMESPACE_DECL {
// Returns the overhead associated with calling the profiling region. This
@@ -50,7 +43,8 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- FORCE_TO_REGISTER(T, arg);
+ // VGPR constraints can only accept primitive values.
+ asm("" ::"v"(&arg));
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
@@ -59,8 +53,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- FORCE_TO_REGISTER(T, arg);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg));
// Run the function under test and return its value.
auto result = f(arg);
@@ -87,15 +80,12 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
+ asm("" ::"v"(&arg1), "v"(&arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
auto result = f(arg1, arg2);
@@ -109,6 +99,35 @@ template <typename F, typename T1, typename T2>
return stop - start;
}
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"v"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start), "v"(array_pointer));
+
+ for (auto input : register_array) {
+ auto result = f(input);
+
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
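
The structure of the new throughput helper is fence, read the clock, run the whole batch while keeping every result observably live, then read the clock again. A host-side analogue of the same shape (illustrative sketch only; std::chrono and a volatile sink stand in for gpu::processor_clock() and the v_or_b32 / or.b32 no-ops):

#include <array>
#include <chrono>
#include <cstdint>

template <typename F, typename T, size_t N>
[[gnu::noinline]] uint64_t batch_time_host(F f, const std::array<T, N> &inputs) {
  auto start = std::chrono::steady_clock::now();
  for (auto input : inputs) {
    volatile auto sink = f(input); // force each result to be materialized,
    (void)sink;                    // as the GPU no-op asm sinks do above
  }
  auto stop = std::chrono::steady_clock::now();
  // Elapsed nanoseconds for the whole batch; divide by N for a per-call figure.
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start)
          .count());
}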
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 9958e16206a410..2723c8940814c6 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index b426dfd0ea1535..dee8d6ea41f474 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
@@ -94,6 +96,36 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"r"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"r"(array_pointer), "llr"(start));
+
+ uint64_t result;
+ for (auto input : register_array) {
+ asm("" ::"r"(input));
+ result = f(input);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ gpu::memory_fence();
+ asm("" ::"r"(stop));
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
>From bffe71a26f23a474eba48c5b19c8794a7705a293 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 21:58:48 -0400
Subject: [PATCH 2/5] add trailing new line
---
libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 17b31402b3bd75..03f824deae6a5a 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -52,4 +52,4 @@ BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
-#endif
\ No newline at end of file
+#endif
>From 29635687ad4d4536a2bfa4e7b12319c8fb9192af Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 4 Aug 2024 22:10:23 -0400
Subject: [PATCH 3/5] run clang-format
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e53eb25f83930f..30b34de4fc87b0 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,12 +9,12 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
-#include "src/__support/CPP/array.h"
#include <stdint.h>
>From df3f0face9bee30141c82ed152e21efc5f29a2c7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 8 Aug 2024 14:04:17 -0400
Subject: [PATCH 4/5] remove arg inline asm capture and capture array pointer
for throughput on nvptx
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 ++--
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 4 ++--
libc/benchmarks/gpu/timing/nvptx/timing.h | 21 +++++++------------
3 files changed, 12 insertions(+), 17 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 39e4a6e9e0152a..830e6f9e89a743 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -149,12 +149,12 @@ template <typename T> class MathPerf {
typedef T Func(T);
template <size_t N = 1>
- static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+ static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
cpp::array<T, N> inputs;
for (size_t i = 0; i < N; ++i)
inputs[i] = get_rand_input<T>(min_exp, max_exp);
- uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+ uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
return total_time / N;
}
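
As a usage sketch (hypothetical call site, not part of the patch), the renamed helper returns the average clock-cycle count per call over a batch of N random inputs:

// Time 128 sin() calls on inputs whose exponents fall in [-10, 3] and get
// back the average number of cycles per call.
uint64_t avg_cycles =
    LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_throughput_in_range<128>(
        LIBC_NAMESPACE::sin, -10, 3);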
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 03f824deae6a5a..e86961790b9438 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -21,8 +21,8 @@
// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
- Func, MIN_EXP, MAX_EXP); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf< \
+ double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP); \
}
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index dee8d6ea41f474..637986abd9092d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
volatile uint32_t x = 1;
uint32_t y = x;
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(y), "llr"(start));
+ asm("" ::"llr"(start));
uint32_t result = y;
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
@@ -44,7 +44,6 @@ template <typename F, typename T>
// not constant propagate it and remove the profiling region.
volatile T storage = t;
T arg = storage;
- asm("" ::"r"(arg));
// Get the current timestamp from the clock.
gpu::memory_fence();
@@ -52,7 +51,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock cycle
// counter before the profiling region.
- asm("" ::"r"(arg), "llr"(start));
+ asm("" ::"llr"(start));
// Run the function under test and return its value.
auto result = f(arg);
@@ -78,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T2 storage2 = t2;
T1 arg = storage;
T2 arg2 = storage2;
- asm("" ::"r"(arg), "r"(arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(arg), "r"(arg2), "llr"(start));
+ asm("" ::"llr"(start));
auto result = f(arg, arg2);
@@ -100,22 +98,19 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
// Provides throughput benchmarking.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
-latency(F f, const cpp::array<T, N> &inputs) {
- volatile auto storage = &inputs;
- auto array_pointer = storage;
- asm("" ::"r"(array_pointer));
- auto register_array = *array_pointer;
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"r"(&inputs));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(array_pointer), "llr"(start));
+ asm("" ::"llr"(start));
uint64_t result;
- for (auto input : register_array) {
+ for (auto input : inputs) {
asm("" ::"r"(input));
result = f(input);
- asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+ asm("" ::"r"(result));
}
uint64_t stop = gpu::processor_clock();
>From 47a0f43c93d559596a46b600e9387305b218c85e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 8 Aug 2024 14:08:38 -0400
Subject: [PATCH 5/5] fix asm constraints for amdgpu
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 42 +++++++++++-----------
1 file changed, 22 insertions(+), 20 deletions(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 30b34de4fc87b0..d5c3df27b7de60 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -43,9 +43,6 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- // VGPR constraints can only accept primitive values.
- asm("" ::"v"(&arg));
-
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
// Get the current timestamp from the clock.
@@ -53,15 +50,22 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- asm("" ::"s"(start), "v"(&arg));
+ asm("" ::"s"(start));
// Run the function under test and return its value.
auto result = f(arg);
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ // AMDGPU does not support input register constraints for i1 and i8, so we
+ // cast it to a 32-bit integer. This does not add an additional assembly
+ // instruction (https://godbolt.org/z/zxGqv8G91).
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -80,17 +84,19 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- asm("" ::"v"(&arg1), "v"(&arg2));
-
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
+ asm("" ::"s"(start));
auto result = f(arg1, arg2);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
@@ -102,22 +108,18 @@ template <typename F, typename T1, typename T2>
// Provides throughput benchmarking.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
-latency(F f, const cpp::array<T, N> &inputs) {
- volatile auto storage = &inputs;
- auto array_pointer = storage;
- asm("" ::"v"(array_pointer));
- auto register_array = *array_pointer;
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"v"(&inputs));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"s"(start), "v"(array_pointer));
+ asm("" ::"s"(start));
- for (auto input : register_array) {
+ for (auto input : inputs) {
auto result = f(input);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ asm("" ::"v"(result));
}
uint64_t stop = gpu::processor_clock();
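
The i1/i8 workaround duplicated across the two latency() overloads above can be read as a single helper (a minimal sketch; the name sink_result is hypothetical): AMDGPU "v" (VGPR) input constraints reject bool and char operands, so those results are widened to uint32_t before being handed to the no-op v_or_b32, which adds no extra instruction.

template <typename R> static LIBC_INLINE void sink_result(R result) {
  if constexpr (cpp::is_same_v<R, char> || cpp::is_same_v<R, bool>)
    // Widen i1/i8 results so the "v" constraint is legal.
    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
        static_cast<uint32_t>(result)));
  else
    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
}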