[libc-commits] [libc] 9a070d6 - [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (#101917)
via libc-commits
libc-commits at lists.llvm.org
Thu Aug 8 13:05:37 PDT 2024
Author: jameshu15869
Date: 2024-08-08T15:05:34-05:00
New Revision: 9a070d6d0f0c111c2269a912f98908c821993e37
URL: https://github.com/llvm/llvm-project/commit/9a070d6d0f0c111c2269a912f98908c821993e37
DIFF: https://github.com/llvm/llvm-project/commit/9a070d6d0f0c111c2269a912f98908c821993e37.diff
LOG: [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (#101917)
This PR implements the changes from
https://github.com/lntue/llvm-project/commit/2a158426d4b90ffaa3eaecc9bc10e5aed11f1bcf
to provide better throughput benchmarking for libc `sin()` and
`__nv_sin()`.
These changes have not been tested on AMDGPU yet, only compiled.
Added:
Modified:
libc/benchmarks/gpu/LibcGpuBenchmark.h
libc/benchmarks/gpu/src/math/sin_benchmark.cpp
libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
libc/benchmarks/gpu/timing/amdgpu/timing.h
libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
libc/benchmarks/gpu/timing/nvptx/timing.h
Removed:
################################################################################
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2b85b146ed7459..830e6f9e89a743 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
- uint32_t min_iterations = 50;
+ uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
};
// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+// -EXP_BIAS corresponding to denormal values,
+// EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+ int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+ double scale =
+ static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);
- static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
- StorageType ending_bit, StorageType step) {
- uint64_t total_time = 0;
- if (step <= 0)
- step = 1;
- volatile T result;
- for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
- T x = FPBits(bits).get_val();
- total_time += LIBC_NAMESPACE::latency(f, x);
- }
- StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
- return total_time / num_runs;
+ template <size_t N = 1>
+ static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
+ cpp::array<T, N> inputs;
+ for (size_t i = 0; i < N; ++i)
+ inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+ uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
+
+ return total_time / N;
}
};
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
-
#endif
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 5849ea3e99bb09..e86961790b9438 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
- return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Each benchmark is defined using a macro
// that expands to a lambda, which allows us to switch the implementation of
// `sin()` to easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func) \
- []() { \
- double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
- return LIBC_NAMESPACE::latency(Func, x); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-
-#define BM_TWO_PI(Func) \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf< \
+ double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP); \
}
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
-#define BM_LARGE_INT(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index 179429db9a09ae..aa5dcd33bee9c8 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e308d619e95695..d5c3df27b7de60 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
@@ -17,14 +18,6 @@
#include <stdint.h>
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
- if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
- asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
- else \
- asm("" ::"v"(VARIABLE))
-
namespace LIBC_NAMESPACE_DECL {
// Returns the overhead associated with calling the profiling region. This
@@ -50,8 +43,6 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- FORCE_TO_REGISTER(T, arg);
-
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
// Get the current timestamp from the clock.
@@ -59,7 +50,6 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- FORCE_TO_REGISTER(T, arg);
asm("" ::"s"(start));
// Run the function under test and return its value.
@@ -67,8 +57,15 @@ template <typename F, typename T>
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ // AMDGPU does not support input register constraints for i1 and i8, so we
+ // cast it to a 32-bit integer. This does not add an additional assembly
+ // instruction (https://godbolt.org/z/zxGqv8G91).
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -87,20 +84,19 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
-
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
asm("" ::"s"(start));
auto result = f(arg1, arg2);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
- static_cast<uint32_t>(result)));
+ if constexpr (cpp::is_same_v<decltype(result), char> ||
+ cpp::is_same_v<decltype(result), bool>)
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ else
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
@@ -109,6 +105,31 @@ template <typename F, typename T1, typename T2>
return stop - start;
}
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"v"(&inputs));
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start));
+
+ for (auto input : inputs) {
+ auto result = f(input);
+
+ asm("" ::"v"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 9958e16206a410..2723c8940814c6 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index b426dfd0ea1535..637986abd9092d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
@@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
volatile uint32_t x = 1;
uint32_t y = x;
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(y), "llr"(start));
+ asm("" ::"llr"(start));
uint32_t result = y;
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
@@ -42,7 +44,6 @@ template <typename F, typename T>
// not constant propagate it and remove the profiling region.
volatile T storage = t;
T arg = storage;
- asm("" ::"r"(arg));
// Get the current timestamp from the clock.
gpu::memory_fence();
@@ -50,7 +51,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock cycle
// counter before the profiling region.
- asm("" ::"r"(arg), "llr"(start));
+ asm("" ::"llr"(start));
// Run the function under test and return its value.
auto result = f(arg);
@@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T2 storage2 = t2;
T1 arg = storage;
T2 arg2 = storage2;
- asm("" ::"r"(arg), "r"(arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- asm("" ::"r"(arg), "r"(arg2), "llr"(start));
+ asm("" ::"llr"(start));
auto result = f(arg, arg2);
@@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+throughput(F f, const cpp::array<T, N> &inputs) {
+ asm("" ::"r"(&inputs));
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"llr"(start));
+
+ uint64_t result;
+ for (auto input : inputs) {
+ asm("" ::"r"(input));
+ result = f(input);
+ asm("" ::"r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ gpu::memory_fence();
+ asm("" ::"r"(stop));
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
More information about the libc-commits
mailing list