[llvm] [Offload][Conformance] Add support for CUDA Math and HIP Math providers (PR #152362)
Leandro Lacerda via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 12:45:58 PDT 2025
https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/152362
>From 29602ceb8448fdcd2b175c0c33417351ac4a7e0b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 6 Aug 2025 14:57:31 -0300
Subject: [PATCH 1/3] Redirect test preamble output to `stderr`
---
offload/unittests/Conformance/include/mathtest/TestRunner.hpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/unittests/Conformance/include/mathtest/TestRunner.hpp b/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
index f89d151d0161e..ab17f1d83768a 100644
--- a/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
+++ b/offload/unittests/Conformance/include/mathtest/TestRunner.hpp
@@ -41,11 +41,11 @@ void printPreamble(const TestConfig &Config, size_t Index,
size_t Total) noexcept {
using FunctionConfig = FunctionConfig<Func>;
- llvm::outs() << "[" << (Index + 1) << "/" << Total << "] "
+ llvm::errs() << "[" << (Index + 1) << "/" << Total << "] "
<< "Running conformance test '" << FunctionConfig::Name
<< "' with '" << Config.Provider << "' on '" << Config.Platform
<< "'\n";
- llvm::outs().flush();
+ llvm::errs().flush();
}
template <typename T>
>From 2f7887c12f4901dbc9a00fafaaff28947f3f00fc Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 6 Aug 2025 15:26:09 -0300
Subject: [PATCH 2/3] Add support for `cuda-math` and `hip-math` providers
---
.../Conformance/device_code/CMakeLists.txt | 28 ++-
.../Conformance/device_code/CUDAMath.cpp | 178 ++++++++++++++++++
.../Conformance/device_code/DeviceAPIs.hpp | 113 +++++++++++
.../Conformance/device_code/HIPMath.cpp | 178 ++++++++++++++++++
.../{Common.hpp => KernelRunner.hpp} | 16 +-
.../Conformance/device_code/LLVMLibm.cpp | 5 +-
6 files changed, 506 insertions(+), 12 deletions(-)
create mode 100644 offload/unittests/Conformance/device_code/CUDAMath.cpp
create mode 100644 offload/unittests/Conformance/device_code/DeviceAPIs.hpp
create mode 100644 offload/unittests/Conformance/device_code/HIPMath.cpp
rename offload/unittests/Conformance/device_code/{Common.hpp => KernelRunner.hpp} (70%)
diff --git a/offload/unittests/Conformance/device_code/CMakeLists.txt b/offload/unittests/Conformance/device_code/CMakeLists.txt
index 789dd167bb9ff..992f54c0c2376 100644
--- a/offload/unittests/Conformance/device_code/CMakeLists.txt
+++ b/offload/unittests/Conformance/device_code/CMakeLists.txt
@@ -1,4 +1,30 @@
+set(cuda_math_flags "")
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+ file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
+ list(GET libdevice_paths 0 libdevice_path)
+
+ if (EXISTS ${libdevice_path})
+ list(APPEND cuda_math_flags "-Xclang" "-mlink-builtin-bitcode" "-Xclang" "${libdevice_path}")
+ list(APPEND cuda_math_flags "-DCUDA_MATH_FOUND=1")
+ endif()
+endif()
+
+set(hip_math_flags "")
+find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+if(AMDDeviceLibs_FOUND)
+ get_target_property(ocml_path ocml IMPORTED_LOCATION)
+ list(APPEND hip_math_flags "-Xclang" "-mlink-builtin-bitcode" "-Xclang" "${ocml_path}")
+ list(APPEND hip_math_flags "-DHIP_MATH_FOUND=1")
+endif()
+
+add_offload_test_device_code(CUDAMath.cpp cuda-math -O3 -stdlib -fno-builtin ${cuda_math_flags})
+add_offload_test_device_code(HIPMath.cpp hip-math -O3 -stdlib -fno-builtin ${hip_math_flags})
add_offload_test_device_code(LLVMLibm.cpp llvm-libm -O3 -stdlib -fno-builtin)
-add_custom_target(conformance_device_binaries DEPENDS llvm-libm.bin)
+add_custom_target(conformance_device_binaries DEPENDS
+ cuda-math.bin
+ hip-math.bin
+ llvm-libm.bin
+)
set(OFFLOAD_CONFORMANCE_DEVICE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/Conformance/device_code/CUDAMath.cpp b/offload/unittests/Conformance/device_code/CUDAMath.cpp
new file mode 100644
index 0000000000000..a351e924b8f89
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/CUDAMath.cpp
@@ -0,0 +1,178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the device kernels that wrap the
+/// math functions from the cuda-math provider.
+///
+//===----------------------------------------------------------------------===//
+
+#ifdef CUDA_MATH_FOUND
+
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
+
+#include <gpuintrin.h>
+#include <stddef.h>
+
+using namespace kernels;
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+static inline float sincosfSin(float X) {
+ float SinX, CosX;
+ __nv_sincosf(X, &SinX, &CosX);
+ return SinX;
+}
+
+static inline float sincosfCos(float X) {
+ float SinX, CosX;
+ __nv_sincosf(X, &SinX, &CosX);
+ return CosX;
+}
+
+//===----------------------------------------------------------------------===//
+// Kernels
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+
+__gpu_kernel void acosfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_acosf>(NumElements, Out, X);
+}
+
+__gpu_kernel void acoshfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_acoshf>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_asinf>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_asinhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_atanf>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_atanhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_cbrtf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cosfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_cosf>(NumElements, Out, X);
+}
+
+__gpu_kernel void coshfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_coshf>(NumElements, Out, X);
+}
+
+__gpu_kernel void cospifKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_cospif>(NumElements, Out, X);
+}
+
+__gpu_kernel void erffKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_erff>(NumElements, Out, X);
+}
+
+__gpu_kernel void expfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_expf>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_exp10f>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_exp2f>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_expm1f>(NumElements, Out, X);
+}
+
+__gpu_kernel void logfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_logf>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_log10f>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_log1pf>(NumElements, Out, X);
+}
+
+__gpu_kernel void log2fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_log2f>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_sinf>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<sincosfSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<sincosfCos>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_sinhf>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinpifKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_sinpif>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_tanf>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__nv_tanhf>(NumElements, Out, X);
+}
+} // extern "C"
+
+#endif // CUDA_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/DeviceAPIs.hpp b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
new file mode 100644
index 0000000000000..8476dcbeff0c9
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/DeviceAPIs.hpp
@@ -0,0 +1,113 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains platform-specific definitions and forward declarations
+/// for device-side APIs used by the kernels.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
+#define CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
+
+#include <stdint.h>
+
+typedef _Float16 float16;
+
+#ifdef __AMDGPU__
+
+// The ROCm device library uses control globals to alter codegen for the
+// different targets. To avoid needing to link them in manually, we simply
+// define them here.
+extern "C" {
+extern const inline uint8_t __oclc_unsafe_math_opt = 0;
+extern const inline uint8_t __oclc_daz_opt = 0;
+extern const inline uint8_t __oclc_correctly_rounded_sqrt32 = 1;
+extern const inline uint8_t __oclc_finite_only_opt = 0;
+extern const inline uint32_t __oclc_ISA_version = 9000;
+}
+
+// These aliases cause Clang to emit the control constants with ODR linkage.
+// This allows us to link against the symbols without preventing them from being
+// optimized out or causing symbol collisions.
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
+[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
+[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
+ __oclc_correctly_rounded_sqrt32__;
+[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
+[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
+
+#endif // __AMDGPU__
+
+#ifdef CUDA_MATH_FOUND
+
+extern "C" {
+
+float __nv_acosf(float);
+float __nv_acoshf(float);
+float __nv_asinf(float);
+float __nv_asinhf(float);
+float __nv_atanf(float);
+float __nv_atanhf(float);
+float __nv_cbrtf(float);
+float __nv_cosf(float);
+float __nv_coshf(float);
+float __nv_cospif(float);
+float __nv_erff(float);
+float __nv_expf(float);
+float __nv_exp10f(float);
+float __nv_exp2f(float);
+float __nv_expm1f(float);
+float __nv_logf(float);
+float __nv_log10f(float);
+float __nv_log1pf(float);
+float __nv_log2f(float);
+float __nv_sinf(float);
+void __nv_sincosf(float, float *, float *);
+float __nv_sinhf(float);
+float __nv_sinpif(float);
+float __nv_tanf(float);
+float __nv_tanhf(float);
+} // extern "C"
+
+#endif // CUDA_MATH_FOUND
+
+#ifdef HIP_MATH_FOUND
+
+extern "C" {
+
+float __ocml_acos_f32(float);
+float __ocml_acosh_f32(float);
+float __ocml_asin_f32(float);
+float __ocml_asinh_f32(float);
+float __ocml_atan_f32(float);
+float __ocml_atanh_f32(float);
+float __ocml_cbrt_f32(float);
+float __ocml_cos_f32(float);
+float __ocml_cosh_f32(float);
+float __ocml_cospi_f32(float);
+float __ocml_erf_f32(float);
+float __ocml_exp_f32(float);
+float __ocml_exp10_f32(float);
+float __ocml_exp2_f32(float);
+float __ocml_expm1_f32(float);
+float __ocml_log_f32(float);
+float __ocml_log10_f32(float);
+float __ocml_log1p_f32(float);
+float __ocml_log2_f32(float);
+float __ocml_sin_f32(float);
+float __ocml_sincos_f32(float, float *);
+float __ocml_sinh_f32(float);
+float __ocml_sinpi_f32(float);
+float __ocml_tan_f32(float);
+float __ocml_tanh_f32(float);
+} // extern "C"
+
+#endif // HIP_MATH_FOUND
+
+#endif // CONFORMANCE_DEVICE_CODE_DEVICEAPIS_HPP
diff --git a/offload/unittests/Conformance/device_code/HIPMath.cpp b/offload/unittests/Conformance/device_code/HIPMath.cpp
new file mode 100644
index 0000000000000..36efe6b2696ab
--- /dev/null
+++ b/offload/unittests/Conformance/device_code/HIPMath.cpp
@@ -0,0 +1,178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the device kernels that wrap the
+/// math functions from the hip-math provider.
+///
+//===----------------------------------------------------------------------===//
+
+#ifdef HIP_MATH_FOUND
+
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
+
+#include <gpuintrin.h>
+#include <stddef.h>
+
+using namespace kernels;
+
+//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+static inline float sincosfSin(float X) {
+ float CosX;
+ float SinX = __ocml_sincos_f32(X, &CosX);
+ return SinX;
+}
+
+static inline float sincosfCos(float X) {
+ float CosX;
+ float SinX = __ocml_sincos_f32(X, &CosX);
+ return CosX;
+}
+
+//===----------------------------------------------------------------------===//
+// Kernels
+//===----------------------------------------------------------------------===//
+
+extern "C" {
+
+__gpu_kernel void acosfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_acos_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void acoshfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_acosh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_asin_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void asinhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_asinh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_atan_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void atanhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_atanh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cbrtfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_cbrt_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cosfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_cos_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void coshfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_cosh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void cospifKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_cospi_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void erffKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_erf_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void expfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_exp_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp10fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_exp10_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void exp2fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_exp2_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void expm1fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_expm1_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void logfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_log_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log10fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_log10_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log1pfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_log1p_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void log2fKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_log2_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_sin_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfSinKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<sincosfSin>(NumElements, Out, X);
+}
+
+__gpu_kernel void sincosfCosKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<sincosfCos>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_sinh_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void sinpifKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_sinpi_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_tan_f32>(NumElements, Out, X);
+}
+
+__gpu_kernel void tanhfKernel(const float *X, float *Out,
+ size_t NumElements) noexcept {
+ runKernelBody<__ocml_tanh_f32>(NumElements, Out, X);
+}
+} // extern "C"
+
+#endif // HIP_MATH_FOUND
diff --git a/offload/unittests/Conformance/device_code/Common.hpp b/offload/unittests/Conformance/device_code/KernelRunner.hpp
similarity index 70%
rename from offload/unittests/Conformance/device_code/Common.hpp
rename to offload/unittests/Conformance/device_code/KernelRunner.hpp
index bcf3ac617b54c..e64a62fbdf018 100644
--- a/offload/unittests/Conformance/device_code/Common.hpp
+++ b/offload/unittests/Conformance/device_code/KernelRunner.hpp
@@ -7,21 +7,19 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file contains common utilities for defining device kernel wrappers to
-/// math functions.
+/// This file contains the definition of runKernelBody, a template helper
+/// that executes the per-thread logic of a math function's kernel wrapper.
///
//===----------------------------------------------------------------------===//
-#ifndef CONFORMANCE_DEVICE_CODE_COMMON_HPP
-#define CONFORMANCE_DEVICE_CODE_COMMON_HPP
+#ifndef CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
+#define CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
#include <gpuintrin.h>
#include <stddef.h>
#include <stdint.h>
-namespace common {
-
-typedef _Float16 float16;
+namespace kernels {
template <auto Func, typename OutType, typename... InTypes>
void runKernelBody(size_t NumElements, OutType *Out, const InTypes *...Ins) {
@@ -32,6 +30,6 @@ void runKernelBody(size_t NumElements, OutType *Out, const InTypes *...Ins) {
Out[Index] = Func(Ins[Index]...);
}
}
-} // namespace common
+} // namespace kernels
-#endif // CONFORMANCE_DEVICE_CODE_COMMON_HPP
+#endif // CONFORMANCE_DEVICE_CODE_KERNELRUNNER_HPP
diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.cpp b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
index f137ba3d23752..8869d87017486 100644
--- a/offload/unittests/Conformance/device_code/LLVMLibm.cpp
+++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
@@ -12,13 +12,14 @@
///
//===----------------------------------------------------------------------===//
-#include "Conformance/device_code/Common.hpp"
+#include "Conformance/device_code/DeviceAPIs.hpp"
+#include "Conformance/device_code/KernelRunner.hpp"
#include <gpuintrin.h>
#include <math.h>
#include <stddef.h>
-using namespace common;
+using namespace kernels;
//===----------------------------------------------------------------------===//
// Helpers
>From 9c33ceafd45ff1d3ae0270709cbfd88bd0eb7254 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Wed, 6 Aug 2025 22:20:44 -0300
Subject: [PATCH 3/3] Encapsulate vendor math lib logic in helper
---
offload/unittests/CMakeLists.txt | 31 +++++++++++++++++--
.../Conformance/device_code/CMakeLists.txt | 24 ++------------
2 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/offload/unittests/CMakeLists.txt b/offload/unittests/CMakeLists.txt
index a0d5c01263056..e53042d0ba943 100644
--- a/offload/unittests/CMakeLists.txt
+++ b/offload/unittests/CMakeLists.txt
@@ -18,6 +18,9 @@ endif ()
set(OFFLOAD_UNITTESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
function(add_offload_test_device_code test_filename test_name)
+ cmake_parse_arguments(
+ "ARGS" "WITH_DEVICE_MATH_LIBS" "" "" ${ARGN})
+
set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
@@ -37,13 +40,25 @@ function(add_offload_test_device_code test_filename test_name)
endif()
if(nvptx_arch AND CUDAToolkit_FOUND)
+ set(nvptx_compile_flags ${ARGS_UNPARSED_ARGUMENTS})
+
+ if(ARGS_WITH_DEVICE_MATH_LIBS)
+ file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
+ if(libdevice_paths)
+ list(GET libdevice_paths 0 libdevice_path)
+ list(APPEND nvptx_compile_flags "-Xclang" "-mlink-builtin-bitcode")
+ list(APPEND nvptx_compile_flags "-Xclang" "${libdevice_path}")
+ list(APPEND nvptx_compile_flags "-DCUDA_MATH_FOUND=1")
+ endif()
+ endif()
+
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
add_custom_command(
OUTPUT ${output_file}
COMMAND ${CMAKE_CXX_COMPILER}
-I${OFFLOAD_UNITTESTS_DIR}
--target=nvptx64-nvidia-cuda -march=${nvptx_arch}
- -nogpulib --cuda-path=${cuda_path} -flto ${ARGN}
+ -nogpulib --cuda-path=${cuda_path} -flto ${nvptx_compile_flags}
${SRC_PATH} -o ${output_file}
DEPENDS ${SRC_PATH}
)
@@ -62,13 +77,25 @@ function(add_offload_test_device_code test_filename test_name)
endif()
if(amdgpu_arch)
+ set(amdgpu_compile_flags ${ARGS_UNPARSED_ARGUMENTS})
+
+ if(ARGS_WITH_DEVICE_MATH_LIBS)
+ find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+ if(AMDDeviceLibs_FOUND)
+ get_target_property(ocml_path ocml IMPORTED_LOCATION)
+ list(APPEND amdgpu_compile_flags "-Xclang" "-mlink-builtin-bitcode")
+ list(APPEND amdgpu_compile_flags "-Xclang" "${ocml_path}")
+ list(APPEND amdgpu_compile_flags "-DHIP_MATH_FOUND=1")
+ endif()
+ endif()
+
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
add_custom_command(
OUTPUT ${output_file}
COMMAND ${CMAKE_CXX_COMPILER}
-I${OFFLOAD_UNITTESTS_DIR}
--target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
- -nogpulib -flto ${ARGN} ${SRC_PATH} -o ${output_file}
+ -nogpulib -flto ${amdgpu_compile_flags} ${SRC_PATH} -o ${output_file}
DEPENDS ${SRC_PATH}
)
add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
diff --git a/offload/unittests/Conformance/device_code/CMakeLists.txt b/offload/unittests/Conformance/device_code/CMakeLists.txt
index 992f54c0c2376..a0c5369f24ae1 100644
--- a/offload/unittests/Conformance/device_code/CMakeLists.txt
+++ b/offload/unittests/Conformance/device_code/CMakeLists.txt
@@ -1,25 +1,5 @@
-set(cuda_math_flags "")
-find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
- file(GLOB libdevice_paths "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.*.bc")
- list(GET libdevice_paths 0 libdevice_path)
-
- if (EXISTS ${libdevice_path})
- list(APPEND cuda_math_flags "-Xclang" "-mlink-builtin-bitcode" "-Xclang" "${libdevice_path}")
- list(APPEND cuda_math_flags "-DCUDA_MATH_FOUND=1")
- endif()
-endif()
-
-set(hip_math_flags "")
-find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-if(AMDDeviceLibs_FOUND)
- get_target_property(ocml_path ocml IMPORTED_LOCATION)
- list(APPEND hip_math_flags "-Xclang" "-mlink-builtin-bitcode" "-Xclang" "${ocml_path}")
- list(APPEND hip_math_flags "-DHIP_MATH_FOUND=1")
-endif()
-
-add_offload_test_device_code(CUDAMath.cpp cuda-math -O3 -stdlib -fno-builtin ${cuda_math_flags})
-add_offload_test_device_code(HIPMath.cpp hip-math -O3 -stdlib -fno-builtin ${hip_math_flags})
+add_offload_test_device_code(CUDAMath.cpp cuda-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
+add_offload_test_device_code(HIPMath.cpp hip-math WITH_DEVICE_MATH_LIBS -O3 -stdlib -fno-builtin)
add_offload_test_device_code(LLVMLibm.cpp llvm-libm -O3 -stdlib -fno-builtin)
add_custom_target(conformance_device_binaries DEPENDS
More information about the llvm-commits
mailing list