[libc-commits] [libc] [libc] DRAFT: Add Generic and NVPTX Sin Benchmark (PR #99795)
via libc-commits
libc-commits at lists.llvm.org
Sat Jul 20 20:39:18 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libc
Author: None (jameshu15869)
<details>
<summary>Changes</summary>
This PR adds `sin` benchmarks over fixed ranges of input values and over a pregenerated set of random inputs.
---
Full diff: https://github.com/llvm/llvm-project/pull/99795.diff
6 Files Affected:
- (modified) libc/benchmarks/gpu/CMakeLists.txt (+3)
- (modified) libc/benchmarks/gpu/LibcGpuBenchmark.cpp (+3)
- (modified) libc/benchmarks/gpu/LibcGpuBenchmark.h (+53)
- (modified) libc/benchmarks/gpu/src/CMakeLists.txt (+1)
- (added) libc/benchmarks/gpu/src/math/CMakeLists.txt (+31)
- (added) libc/benchmarks/gpu/src/math/sin_benchmark.cpp (+55)
``````````diff
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 29e27724e1ab3..ba6958a0c68d5 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -47,13 +47,16 @@ add_unittest_framework_library(
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
+ libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
+ libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
+ libc.src.stdlib.rand
libc.benchmarks.gpu.timing.timing
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c926d8efd7db2..05a6621036b0b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -107,6 +107,9 @@ void print_results(Benchmark *b) {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
+ if (id == 0) {
+ LIBC_NAMESPACE::benchmarks::init_random_input();
+ }
gpu::sync_threads();
for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 29d7ba8b0a132..5d84959e17c4b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,13 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -102,6 +105,56 @@ class Benchmark {
return benchmark(options, func);
}
};
+
+// We want the magnitude of our random values to be approximately
+// |real value| <= 2^(MAX_EXPONENT) * (1 + (random 52 bits) * 2^-52) <
+// 2^(MAX_EXPONENT + 1)
+// A double represents every integer exactly only up to 2^53.
+static constexpr int MAX_EXPONENT = 52;
+static constexpr int RANDOM_INPUT_SIZE = 1024;
+static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+
+static double get_rand() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + MAX_EXPONENT / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+static void init_random_input() {
+ for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+ random_input[i] = get_rand();
+ }
+}
+
+template <typename T> class MathPerf {
+ using FPBits = fputil::FPBits<T>;
+ using StorageType = typename FPBits::StorageType;
+ static constexpr StorageType UIntMax =
+ cpp::numeric_limits<StorageType>::max();
+
+public:
+ typedef T Func(T);
+
+ static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+ StorageType ending_bit, StorageType step) {
+ uint64_t total_time = 0;
+ if (step <= 0)
+ step = 1;
+ volatile T result;
+ for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+ T x = FPBits(bits).get_val();
+ total_time += LIBC_NAMESPACE::latency(f, x);
+ }
+ StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+ return total_time / num_runs;
+ }
+};
+
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..2b27652e46ae9
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+ set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+ if (EXISTS ${libdevice_path})
+ set(nvptx_bitcode_link_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+ # Compile definition needed so the benchmark knows to register
+ # NVPTX benchmarks.
+ set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+ endif()
+endif()
+
+add_benchmark(
+ sin_benchmark
+ SUITE
+ libc-gpu-math-benchmarks
+ SRCS
+ sin_benchmark.cpp
+ DEPENDS
+ libc.src.math.sin
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.CPP.bit
+ COMPILE_OPTIONS
+ ${nvptx_math_found}
+ ${nvptx_bitcode_link_flags}
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..ac35e22b57287
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,55 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+ return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function with no parameters that returns a uint64_t
+// representing the latency. Each benchmark is defined by a macro that expands
+// to a lambda, which lets us swap the implementation of `sin()` and easily
+// register the NVPTX benchmarks as well.
+#define BM_RANDOM_INPUT(Func) \
+ []() { \
+ uint64_t total_time = 0; \
+ for (double i : LIBC_NAMESPACE::benchmarks::random_input) { \
+ total_time += LIBC_NAMESPACE::latency(Func, i); \
+ } \
+ return total_time / LIBC_NAMESPACE::benchmarks::random_input.size(); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+ BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+ BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif
``````````
</details>
https://github.com/llvm/llvm-project/pull/99795
More information about the libc-commits
mailing list