[libc-commits] [libc] [libc][gpu] Add Atan2 Benchmarks (PR #104708)
via libc-commits
libc-commits at lists.llvm.org
Sun Aug 18 07:22:31 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/104708
>From 6ad03fa39c35c0f104ee4786835de83cb4296918 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 18 Aug 2024 10:06:58 -0400
Subject: [PATCH 1/2] add atan2 benchmarks
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 4 +-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 21 +++++++--
libc/benchmarks/gpu/src/math/CMakeLists.txt | 19 ++++++++
.../gpu/src/math/atan2_benchmark.cpp | 47 +++++++++++++++++++
libc/benchmarks/gpu/timing/amdgpu/timing.h | 25 ++++++++++
libc/benchmarks/gpu/timing/nvptx/timing.h | 29 ++++++++++++
6 files changed, 140 insertions(+), 5 deletions(-)
create mode 100644 libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f237e2ea1b9545..920c5b206b0fef 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
LIBC_NAMESPACE::printf(
- "%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
+ "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
b->get_test_name().data(), result.cycles, result.min, result.max,
result.total_iterations, result.total_time, time_unit,
static_cast<uint64_t>(result.standard_deviation), num_threads);
@@ -127,7 +127,7 @@ void print_header() {
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles =
- "Benchmark | Cycles | Min | Max | "
+ "Benchmark | Cycles | Min | Max | "
"Iterations | Time / Iteration | Stddev | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 830e6f9e89a743..f2cfbfbfdcdf0d 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -146,10 +146,8 @@ template <typename T> class MathPerf {
cpp::numeric_limits<StorageType>::max();
public:
- typedef T Func(T);
-
template <size_t N = 1>
- static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
+ static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
cpp::array<T, N> inputs;
for (size_t i = 0; i < N; ++i)
inputs[i] = get_rand_input<T>(min_exp, max_exp);
@@ -158,6 +156,23 @@ template <typename T> class MathPerf {
return total_time / N;
}
+
+ // Two-input throughput: times f over N generated (arg1, arg2) pairs.
+ template <size_t N = 1>
+ static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
+ int arg1_max_exp, int arg2_min_exp,
+ int arg2_max_exp) {
+ cpp::array<T, N> inputs1;
+ cpp::array<T, N> inputs2;
+ for (size_t i = 0; i < N; ++i) {
+ inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
+ inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+ }
+ // Value reported by throughput() for all N calls of f.
+ uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
+ // Per-call average; integer division drops any remainder.
+ return total_time / N;
+ }
};
} // namespace benchmarks
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
index 335da5ad71cf88..6870c024490166 100644
--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -43,3 +43,22 @@ add_benchmark(
LOADER_ARGS
--threads 64
)
+# atan2 GPU benchmark target; LOADER_ARGS passes `--threads 64`.
+add_benchmark(
+ atan2_benchmark
+ SUITE
+ libc-gpu-math-benchmarks
+ SRCS
+ atan2_benchmark.cpp
+ DEPENDS
+ libc.src.math.atan2
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.CPP.bit
+ libc.src.__support.CPP.array
+ COMPILE_OPTIONS
+ ${math_benchmark_flags}
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
new file mode 100644
index 00000000000000..3bb5b0cc6788ca
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -0,0 +1,47 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/math/atan2.h"
+#include "src/stdlib/rand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+#include "src/math/amdgpu/declarations.h"
+#endif
+// Lambda calling run_throughput_in_range<N>; both args use MIN_EXP, MAX_EXP.
+#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
+ N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \
+ }
+// Expands to four benchmarks of Func, Name_<N>, for N = 1, 128, 1024, 4096.
+#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \
+ BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \
+ BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \
+ BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \
+ BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+// LIBC_NAMESPACE::atan2 over four (MIN_EXP, MAX_EXP) settings.
+BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
+BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
+BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
+BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
+// Same settings for __nv_atan2; built only if NVPTX_MATH_FOUND is defined.
+#ifdef NVPTX_MATH_FOUND
+BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+#endif
+// Same settings for __ocml_atan2_f64; built only if AMDGPU_MATH_FOUND is set.
+#ifdef AMDGPU_MATH_FOUND
+BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index d5c3df27b7de60..4cf7e9838add34 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
return stop - start;
}
+// Times f(inputs1[i], inputs2[i]) over all N pairs (e.g. atan2()).
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+ F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+ asm("" ::"v"(&inputs1), "v"(&inputs2)); // empty asm, input operands only
+ // Fence, then sample the clock to open the timed region.
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start)); // empty asm taking start as its only input
+
+ for (size_t i = 0; i < inputs1.size(); i++) {
+ auto result = f(inputs1[i], inputs2[i]);
+ // Feed result to an empty asm; presumably keeps the call from being elided.
+ asm("" ::"v"(result));
+ }
+ // Sample the clock again, then fence, to close the timed region.
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Elapsed gpu::processor_clock() count for the whole loop (not per call).
+ return stop - start;
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 637986abd9092d..5dc5076e6acc45 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -121,6 +121,35 @@ throughput(F f, const cpp::array<T, N> &inputs) {
// Return the time elapsed.
return stop - start;
}
+
+// Times f(inputs1[i], inputs2[i]) over all N pairs (e.g. atan2()).
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+ F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+ asm("" ::"r"(&inputs1), "r"(&inputs2)); // empty asm, input operands only
+ // Fence, then sample the clock to open the timed region.
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"llr"(start));
+
+ uint64_t result;
+ for (size_t i = 0; i < inputs1.size(); i++) {
+ auto arg1 = inputs1[i];
+ auto arg2 = inputs2[i];
+ asm("" ::"r"(arg1), "r"(arg2));
+ result = f(arg1, arg2);
+ asm("" ::"r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock(); // closes the timed region
+ gpu::memory_fence();
+ asm("" ::"r"(stop));
+ volatile auto output = result; // volatile copy; output is not read afterwards
+ // NOTE(review): result is uint64_t, so f's return is converted - confirm.
+ // Elapsed gpu::processor_clock() count for the whole loop (not per call).
+ return stop - start;
+}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
>From 24a906de752abc1dcae961b15a8c4488df9b587a Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 18 Aug 2024 10:22:20 -0400
Subject: [PATCH 2/2] remove unnecessary asm constraint
---
libc/benchmarks/gpu/timing/nvptx/timing.h | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 5dc5076e6acc45..ece7d9a6c5396c 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -135,10 +135,7 @@ template <typename F, typename T, size_t N>
uint64_t result;
for (size_t i = 0; i < inputs1.size(); i++) {
- auto arg1 = inputs1[i];
- auto arg2 = inputs2[i];
- asm("" ::"r"(arg1), "r"(arg2));
- result = f(arg1, arg2);
+ result = f(inputs1[i], inputs2[i]);
asm("" ::"r"(result));
}
More information about the libc-commits
mailing list