[libc-commits] [libc] de59e7b - [libc] Fix GPU benchmarking
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Fri Jul 18 12:36:28 PDT 2025
Author: Joseph Huber
Date: 2025-07-18T14:36:23-05:00
New Revision: de59e7b86cd349f9f74b7561594aeae410477326
URL: https://github.com/llvm/llvm-project/commit/de59e7b86cd349f9f74b7561594aeae410477326
DIFF: https://github.com/llvm/llvm-project/commit/de59e7b86cd349f9f74b7561594aeae410477326.diff
LOG: [libc] Fix GPU benchmarking
Added:
libc/benchmarks/gpu/src/math/platform.h
Modified:
libc/benchmarks/gpu/LibcGpuBenchmark.cpp
libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
libc/benchmarks/gpu/src/math/sin_benchmark.cpp
libc/benchmarks/gpu/timing/amdgpu/timing.h
libc/benchmarks/gpu/timing/nvptx/timing.h
Removed:
################################################################################
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 920c5b206b0fe..57ff5b9fdb846 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -7,9 +7,9 @@
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
+#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
#include "src/stdlib/srand.h"
-#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
index 3bb5b0cc6788c..1f91a9a35c373 100644
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -3,12 +3,8 @@
#include "src/math/atan2.h"
#include "src/stdlib/rand.h"
-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
#endif
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
@@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
-BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
-BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
-BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+BENCH(double, NvAtan2, __nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
-BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
-BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
-BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000);
#endif
diff --git a/libc/benchmarks/gpu/src/math/platform.h b/libc/benchmarks/gpu/src/math/platform.h
new file mode 100644
index 0000000000000..bb7825d38bd42
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/platform.h
@@ -0,0 +1,57 @@
+//===-- AMDGPU specific platform definitions for math support -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
+#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+#ifdef LIBC_TARGET_ARCH_IS_AMDGPU
+// The ROCm device library uses control globals to alter codegen for the
+// different targets. To avoid needing to link them in manually we simply
+// define them here.
+extern "C" {
+extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
+extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
+}
+
+// These aliases cause clang to emit the control constants with ODR linkage.
+// This allows us to link against the symbols without preventing them from being
+// optimized out or causing symbol collisions.
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
+[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
+[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
+ __oclc_correctly_rounded_sqrt32__;
+[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
+[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
+#endif
+} // namespace LIBC_NAMESPACE_DECL
+
+// Forward declarations for the vendor math libraries.
+extern "C" {
+#ifdef AMDGPU_MATH_FOUND
+double __ocml_sin_f64(double);
+float __ocml_sin_f32(float);
+double __ocml_atan2_f64(double, double);
+float __ocml_atan2_f32(float, float);
+#endif
+
+#ifdef NVPTX_MATH_FOUND
+double __nv_sin(double);
+float __nv_sinf(float);
+double __nv_atan2(double, double);
+float __nv_atan2f(float, float);
+#endif
+}
+
+#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index bf09e6e462172..a759db2e9d33f 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -8,12 +8,8 @@
#include "src/math/sinf.h"
#include "src/stdlib/rand.h"
-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
#endif
// BENCHMARK() expects a function that with no parameters that returns a
@@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
-BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
-BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
-BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
+BENCH(double, NvSin, __nv_sin, -1023, 1023);
+BENCH(double, NvSinTwoPi, __nv_sin, -10, 3);
+BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30);
+BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
-BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
-BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
-BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
+BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023);
+BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3);
+BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30);
+BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000);
#endif
BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128);
@@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30);
BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120);
#ifdef NVPTX_MATH_FOUND
-BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128);
-BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3);
-BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30);
-BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120);
+BENCH(float, NvSinf, __nv_sinf, -127, 128);
+BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3);
+BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30);
+BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128);
-BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3);
-BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30);
-BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120);
+BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128);
+BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3);
+BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30);
+BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120);
#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 4cf7e9838add3..0f2c04c07c921 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
@@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
// allows us to substract the constant-time overhead from the latency to
// obtain a true result. This can vary with system load.
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
uint32_t result = 0.0;
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
@@ -44,13 +45,13 @@ template <typename F, typename T>
T arg = storage;
// The AMDGPU architecture needs to wait on pending results.
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
// Get the current timestamp from the clock.
uint64_t start = gpu::processor_clock();
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- asm("" ::"s"(start));
+ asm("" : "+v"(arg) : "s"(start));
// Run the function under test and return its value.
auto result = f(arg);
@@ -71,7 +72,7 @@ template <typename F, typename T>
// ordering.
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
// Return the time elapsed.
return stop - start;
@@ -84,7 +85,7 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"s"(start));
@@ -100,7 +101,7 @@ template <typename F, typename T1, typename T2>
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
return stop - start;
}
@@ -111,7 +112,7 @@ template <typename F, typename T, size_t N>
throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"v"(&inputs));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"s"(start));
@@ -124,7 +125,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
// Return the time elapsed.
return stop - start;
@@ -136,7 +137,7 @@ template <typename F, typename T, size_t N>
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
asm("" ::"v"(&inputs1), "v"(&inputs2));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"s"(start));
@@ -149,7 +150,7 @@ template <typename F, typename T, size_t N>
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
// Return the time elapsed.
return stop - start;
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index ece7d9a6c5396..3ed97645ddc93 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
@@ -46,7 +47,7 @@ template <typename F, typename T>
T arg = storage;
// Get the current timestamp from the clock.
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
// This forces the compiler to load the input argument and run the clock cycle
@@ -63,7 +64,7 @@ template <typename F, typename T>
// Obtain the current timestamp after running the calculation and force
// ordering.
uint64_t stop = gpu::processor_clock();
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
volatile T output = result;
@@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
T1 arg = storage;
T2 arg2 = storage2;
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"llr"(start));
@@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
volatile auto output = result;
@@ -101,7 +102,7 @@ template <typename F, typename T, size_t N>
throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"r"(&inputs));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"llr"(start));
@@ -114,7 +115,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
}
uint64_t stop = gpu::processor_clock();
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
volatile auto output = result;
@@ -128,7 +129,7 @@ template <typename F, typename T, size_t N>
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
asm("" ::"r"(&inputs1), "r"(&inputs2));
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
uint64_t start = gpu::processor_clock();
asm("" ::"llr"(start));
@@ -140,7 +141,7 @@ template <typename F, typename T, size_t N>
}
uint64_t stop = gpu::processor_clock();
- gpu::memory_fence();
+ cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
volatile auto output = result;
More information about the libc-commits
mailing list