[libc-commits] [libc] [libc] DRAFT: Add Generic and NVPTX Sin Benchmark (PR #99795)
via libc-commits
libc-commits at lists.llvm.org
Sun Jul 21 19:01:14 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/99795
>From 8310874813484168a19af1776f9e58ae3674060d Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 17:08:55 -0400
Subject: [PATCH 1/7] implement generic sin benchmark and compare with nvsin
---
libc/benchmarks/gpu/src/CMakeLists.txt | 1 +
libc/benchmarks/gpu/src/math/CMakeLists.txt | 30 ++++++++++++
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 47 +++++++++++++++++++
3 files changed, 78 insertions(+)
create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/src/math/sin_benchmark.cpp
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..b68f728c9e395
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,30 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+ set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+ if (EXISTS ${libdevice_path})
+ set(nvptx_bitcode_link_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+ # Compile definition needed so the benchmark knows to register
+ # NVPTX benchmarks.
+ set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+ endif()
+endif()
+
+add_benchmark(
+ sin_benchmark
+ SUITE
+ libc-gpu-math-benchmarks
+ SRCS
+ sin_benchmark.cpp
+ DEPENDS
+ libc.src.math.sin
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
+ libc.src.__support.FPUtil.fp_bits
+ COMPILE_OPTIONS
+ ${nvptx_math_found}
+ ${nvptx_bitcode_link_flags}
+ LOADER_ARGS
+ --threads 64
+)
\ No newline at end of file
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..ab65f182d7778
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,47 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+// We want our values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+const int MAX_EXPONENT = 52;
+
+double get_rand(int max_exponent) {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + max_exponent / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+uint64_t BM_Sin() {
+ LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
+ double x = get_rand(MAX_EXPONENT);
+ return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::sin, x);
+}
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_Sin);
+SINGLE_THREADED_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleThread, BM_Sin);
+SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleWave, BM_Sin);
+
+#ifdef NVPTX_MATH_FOUND
+uint64_t BM_NvSin() {
+ LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
+ double x = get_rand(MAX_EXPONENT);
+ return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::__nv_sin, x);
+}
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, BM_NvSin);
+SINGLE_THREADED_BENCHMARK(lvmLibcSinGpuBenchmark, NvSinSingleThread, BM_NvSin);
+SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinSingleWave, BM_NvSin);
+
+#endif
>From 3a8308059b0346ac9f77de609ccfa26d86e1f218 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 19:22:11 -0400
Subject: [PATCH 2/7] basic generic and nvptx sin benchmark
---
libc/benchmarks/gpu/CMakeLists.txt | 3 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 +-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 63 +++++++++++++++++
libc/benchmarks/gpu/src/math/CMakeLists.txt | 1 +
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 70 +++++++++++--------
5 files changed, 109 insertions(+), 33 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 69518acff3a5d..078dd0cbe1536 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -50,13 +50,16 @@ add_unittest_framework_library(
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
+ libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
+ libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
+ libc.src.stdlib.rand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 59de18c20417d..11fd850a0dee6 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -141,9 +141,10 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
- if (id == 0)
+ if (id == 0) {
+ LIBC_NAMESPACE::benchmarks::init_random_input();
print_header();
-
+ }
gpu::sync_threads();
for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index c07fab9ccfbe3..2f8d9ccf54c1b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,13 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -105,6 +108,66 @@ class Benchmark {
return benchmark(options, func);
}
};
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+static constexpr int RANDOM_INPUT_SIZE = 1024;
+static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+
+static double get_rand() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + MAX_EXPONENT / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+static void init_random_input() {
+ for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+ random_input[i] = get_rand();
+ }
+}
+
+template <typename T> class MathPerf {
+ using FPBits = fputil::FPBits<T>;
+ using StorageType = typename FPBits::StorageType;
+ static constexpr StorageType UIntMax =
+ cpp::numeric_limits<StorageType>::max();
+
+public:
+ typedef T Func(T);
+
+ static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+ StorageType ending_bit, StorageType step) {
+ uint64_t total_time = 0;
+ if (step <= 0)
+ step = 1;
+ volatile T result;
+ for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+ T x = FPBits(bits).get_val();
+ total_time += LIBC_NAMESPACE::latency(f, x);
+ }
+ StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+ return total_time / num_runs;
+ }
+
+ static uint64_t run_perf_normal(Func f) {
+ return run_perf_in_range(f, FPBits::min_normal().uintval(),
+ FPBits::max_normal().uintval());
+ }
+
+ static uint64_t run_perf_denormal(Func f) {
+ return run_perf_in_range(f, StorageType(0),
+ FPBits::max_subnormal().uintval());
+ }
+};
+
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index b68f728c9e395..89f698640c791 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -22,6 +22,7 @@ add_benchmark(
libc.src.stdlib.srand
libc.src.stdlib.rand
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.CPP.bit
COMPILE_OPTIONS
${nvptx_math_found}
${nvptx_bitcode_link_flags}
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index ab65f182d7778..c873f94512d20 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,5 +1,7 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/math/sin.h"
#include "src/stdlib/rand.h"
@@ -9,39 +11,45 @@
#include "src/math/nvptx/declarations.h"
#endif
-// We want our values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-const int MAX_EXPONENT = 52;
-
-double get_rand(int max_exponent) {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
- uint64_t bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + max_exponent / 2048.0;
- FPBits fp(bits);
- fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
- return fp.get_val();
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+ return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
}
-uint64_t BM_Sin() {
- LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
- double x = get_rand(MAX_EXPONENT);
- return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::sin, x);
-}
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_Sin);
-SINGLE_THREADED_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleThread, BM_Sin);
-SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, SinSingleWave, BM_Sin);
+// BENCHMARK() expects a function that with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark as a macro uses a
+// lambda to allow us to switch the implementation of `sin()` to easily register
+// NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func) \
+ []() { \
+ uint64_t total_time = 0; \
+ for (double i : LIBC_NAMESPACE::benchmarks::random_input) { \
+ total_time += LIBC_NAMESPACE::latency(Func, i); \
+ } \
+ return total_time / LIBC_NAMESPACE::benchmarks::random_input.size(); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-#ifdef NVPTX_MATH_FOUND
-uint64_t BM_NvSin() {
- LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::get_thread_id());
- double x = get_rand(MAX_EXPONENT);
- return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::__nv_sin, x);
-}
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, BM_NvSin);
-SINGLE_THREADED_BENCHMARK(lvmLibcSinGpuBenchmark, NvSinSingleThread, BM_NvSin);
-SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinSingleWave, BM_NvSin);
+#define BM_TWO_PI(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+ BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+ BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
#endif
>From 50988872f7606da7a6413c1c4d97624512eaf0ce Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 20 Jul 2024 23:38:23 -0400
Subject: [PATCH 3/7] minor fixes
---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 10 ----------
libc/benchmarks/gpu/src/math/CMakeLists.txt | 2 +-
libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 6 +++---
3 files changed, 4 insertions(+), 14 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2f8d9ccf54c1b..eb326b870e248 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -156,16 +156,6 @@ template <typename T> class MathPerf {
return total_time / num_runs;
}
-
- static uint64_t run_perf_normal(Func f) {
- return run_perf_in_range(f, FPBits::min_normal().uintval(),
- FPBits::max_normal().uintval());
- }
-
- static uint64_t run_perf_denormal(Func f) {
- return run_perf_in_range(f, StorageType(0),
- FPBits::max_subnormal().uintval());
- }
};
} // namespace benchmarks
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 89f698640c791..2b27652e46ae9 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -28,4 +28,4 @@ add_benchmark(
${nvptx_bitcode_link_flags}
LOADER_ARGS
--threads 64
-)
\ No newline at end of file
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index c873f94512d20..ac35e22b57287 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -17,9 +17,9 @@ uint64_t get_bits(double x) {
}
// BENCHMARK() expects a function that with no parameters that returns a
-// uint64_t representing the latency. Defining each benchmark as a macro uses a
-// lambda to allow us to switch the implementation of `sin()` to easily register
-// NVPTX benchmarks.
+// uint64_t representing the latency. Defining each benchmark using macro that
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
uint64_t total_time = 0; \
>From 10b1f043e7c9d703569053bfd5e0799c9cf3e6af Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 18:33:18 -0400
Subject: [PATCH 4/7] seed rand with processor clock
---
libc/benchmarks/gpu/CMakeLists.txt | 3 +++
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 24 +++++++++++++++++++
libc/benchmarks/gpu/LibcGpuBenchmark.h | 21 ----------------
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 6 ++---
4 files changed, 30 insertions(+), 24 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 078dd0cbe1536..d1cc58792456c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,6 +22,8 @@ function(add_benchmark benchmark_name)
${BENCHMARK_LINK_LIBRARIES}
DEPENDS
libc.src.stdio.printf
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
${BENCHMARK_DEPENDS}
${BENCHMARK_UNPARSED_ARGUMENTS}
)
@@ -60,6 +62,7 @@ add_unittest_framework_library(
libc.src.__support.fixedvector
libc.src.time.clock
libc.src.stdlib.rand
+ libc.src.stdlib.srand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 11fd850a0dee6..47ff40aea9beb 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,6 +8,7 @@
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/stdio/printf.h"
+#include "src/stdlib/srand.h"
#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE_DECL {
@@ -138,6 +139,29 @@ void print_header() {
"--------------------------------\n");
}
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+
+static double get_rand() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + MAX_EXPONENT / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+static void init_random_input() {
+ LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+ for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+ random_input[i] = get_rand();
+ }
+}
+
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index eb326b870e248..954041f7fc0a5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -109,30 +109,9 @@ class Benchmark {
}
};
-// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-static constexpr int MAX_EXPONENT = 52;
static constexpr int RANDOM_INPUT_SIZE = 1024;
static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
-static double get_rand() {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
- uint64_t bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + MAX_EXPONENT / 2048.0;
- FPBits fp(bits);
- fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
- return fp.get_val();
-}
-
-static void init_random_input() {
- for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
- random_input[i] = get_rand();
- }
-}
-
template <typename T> class MathPerf {
using FPBits = fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index ac35e22b57287..845d9eae5642a 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -16,10 +16,10 @@ uint64_t get_bits(double x) {
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
}
-// BENCHMARK() expects a function that with no parameters that returns a
+// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` to
-// easily register NVPTX benchmarks.
+// expands to a lambda to allow us to switch the implementation of `sin()` and
+// easily register vendor-specific benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
uint64_t total_time = 0; \
>From 686ddfb369adc503e01b0922e8e395a3fb4ac34c Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:49:10 -0400
Subject: [PATCH 5/7] move random array to per thread
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 25 ------------------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 26 +++++++++++++++++--
libc/benchmarks/gpu/src/math/CMakeLists.txt | 1 +
.../benchmarks/gpu/src/math/sin_benchmark.cpp | 15 +++++++----
4 files changed, 35 insertions(+), 32 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 47ff40aea9beb..e5a778bc4ad92 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,7 +8,6 @@
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/stdio/printf.h"
-#include "src/stdlib/srand.h"
#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE_DECL {
@@ -139,34 +138,10 @@ void print_header() {
"--------------------------------\n");
}
-// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-// The largest integer that can be stored in a double is 2^53
-static constexpr int MAX_EXPONENT = 52;
-
-static double get_rand() {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
- uint64_t bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + MAX_EXPONENT / 2048.0;
- FPBits fp(bits);
- fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
- return fp.get_val();
-}
-
-static void init_random_input() {
- LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
- for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
- random_input[i] = get_rand();
- }
-}
-
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
if (id == 0) {
- LIBC_NAMESPACE::benchmarks::init_random_input();
print_header();
}
gpu::sync_threads();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 954041f7fc0a5..7dadabdda5895 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -10,6 +10,7 @@
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -109,8 +110,29 @@ class Benchmark {
}
};
-static constexpr int RANDOM_INPUT_SIZE = 1024;
-static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// Every integer up to 2^53 is exactly representable in a double
+static constexpr int MAX_EXPONENT = 52;
+
+static double get_rand_double() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + MAX_EXPONENT / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+template <size_t Size>
+static void init_random_double_input(cpp::array<double, Size> &values) {
+ LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+ for (int i = 0; i < Size; i++) {
+ values[i] = get_rand_double();
+ }
+}
template <typename T> class MathPerf {
using FPBits = fputil::FPBits<T>;
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 2b27652e46ae9..116dfb580215a 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -23,6 +23,7 @@ add_benchmark(
libc.src.stdlib.rand
libc.src.__support.FPUtil.fp_bits
libc.src.__support.CPP.bit
+ libc.src.__support.CPP.array
COMPILE_OPTIONS
${nvptx_math_found}
${nvptx_bitcode_link_flags}
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 845d9eae5642a..f0b218ef047e5 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -1,5 +1,6 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/FPUtil/FPBits.h"
@@ -16,17 +17,21 @@ uint64_t get_bits(double x) {
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
}
-// BENCHMARK() expects a function with no parameters that returns a
+constexpr int RANDOM_INPUT_SIZE = 256;
+
+// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Defining each benchmark using macro that
-// expands to a lambda to allow us to switch the implementation of `sin()` and
-// easily register vendor-specific benchmarks.
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
+ LIBC_NAMESPACE::cpp::array<double, RANDOM_INPUT_SIZE> random_input; \
+ LIBC_NAMESPACE::benchmarks::init_random_double_input(random_input); \
uint64_t total_time = 0; \
- for (double i : LIBC_NAMESPACE::benchmarks::random_input) { \
+ for (double i : random_input) { \
total_time += LIBC_NAMESPACE::latency(Func, i); \
} \
- return total_time / LIBC_NAMESPACE::benchmarks::random_input.size(); \
+ return total_time / random_input.size(); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
>From 4ae60e80737b137b758c28713594d2ea5a190df2 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:52:39 -0400
Subject: [PATCH 6/7] use single variable instead of array
---
libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index f0b218ef047e5..4d0ecea0f7cdd 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -25,13 +25,8 @@ constexpr int RANDOM_INPUT_SIZE = 256;
// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
- LIBC_NAMESPACE::cpp::array<double, RANDOM_INPUT_SIZE> random_input; \
- LIBC_NAMESPACE::benchmarks::init_random_double_input(random_input); \
- uint64_t total_time = 0; \
- for (double i : random_input) { \
- total_time += LIBC_NAMESPACE::latency(Func, i); \
- } \
- return total_time / random_input.size(); \
+ double x = LIBC_NAMESPACE::benchmarks::get_rand_double(); \
+ return LIBC_NAMESPACE::latency(Func, x); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
>From 81f86e5663d46487b0ff45ac1b2a7f01f595c5ca Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 21 Jul 2024 21:56:51 -0400
Subject: [PATCH 7/7] allow AMDGPU to store doubles to register
---
libc/benchmarks/gpu/timing/amdgpu/timing.h | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 9b40f9282b16b..bfba4043c2505 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -67,7 +67,9 @@ template <typename F, typename T>
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result))
+ :);
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -98,7 +100,9 @@ template <typename F, typename T1, typename T2>
auto result = f(arg1, arg2);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result))
+ :);
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
More information about the libc-commits
mailing list