[llvm] [Offload][UnitTests] Build device code as C++ (PR #151714)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 1 08:39:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Leandro Lacerda (leandrolcampos)
<details>
<summary>Changes</summary>
This commit refactors the `add_offload_test_device_code` CMake function to compile device code using the C++ compiler (`CMAKE_CXX_COMPILER`) instead of the C compiler.
This change enables the use of C++ features, such as templates, within device-side test kernels. This will allow for more advanced and reusable kernel wrappers, reducing boilerplate code in the conformance test suite.
As part of this change:
- All `.c` files for device code in `unittests/` have been renamed to `.cpp`.
- Kernel definitions are now wrapped in `extern "C"` to ensure C linkage and prevent name mangling.
This change affects the `OffloadAPI` and `Conformance` test suites.
cc @callumfare @RossBrunton @jhuber6
---
Full diff: https://github.com/llvm/llvm-project/pull/151714.diff
15 Files Affected:
- (modified) offload/unittests/CMakeLists.txt (+2-2)
- (modified) offload/unittests/Conformance/device_code/CMakeLists.txt (+1-1)
- (renamed) offload/unittests/Conformance/device_code/LLVMLibm.cpp (+3)
- (modified) offload/unittests/OffloadAPI/device_code/CMakeLists.txt (+10-10)
- (renamed) offload/unittests/OffloadAPI/device_code/bar.cpp (+1-1)
- (renamed) offload/unittests/OffloadAPI/device_code/foo.cpp (+1-1)
- (renamed) offload/unittests/OffloadAPI/device_code/global.cpp (+3)
- (renamed) offload/unittests/OffloadAPI/device_code/global_ctor.cpp (+3)
- (renamed) offload/unittests/OffloadAPI/device_code/global_dtor.cpp (+3)
- (renamed) offload/unittests/OffloadAPI/device_code/localmem.cpp (+1-1)
- (renamed) offload/unittests/OffloadAPI/device_code/localmem_reduction.cpp (+1-1)
- (renamed) offload/unittests/OffloadAPI/device_code/localmem_static.cpp (+1-1)
- (removed) offload/unittests/OffloadAPI/device_code/noargs.c (-3)
- (added) offload/unittests/OffloadAPI/device_code/noargs.cpp (+3)
- (renamed) offload/unittests/OffloadAPI/device_code/sequence.cpp (+1-1)
``````````diff
diff --git a/offload/unittests/CMakeLists.txt b/offload/unittests/CMakeLists.txt
index 6d165ffd4c53a..1571658c7006b 100644
--- a/offload/unittests/CMakeLists.txt
+++ b/offload/unittests/CMakeLists.txt
@@ -38,7 +38,7 @@ function(add_offload_test_device_code test_filename test_name)
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
add_custom_command(
OUTPUT ${output_file}
- COMMAND ${CMAKE_C_COMPILER}
+ COMMAND ${CMAKE_CXX_COMPILER}
--target=nvptx64-nvidia-cuda -march=${nvptx_arch}
-nogpulib --cuda-path=${CUDA_ROOT} -flto ${ARGN}
${SRC_PATH} -o ${output_file}
@@ -62,7 +62,7 @@ function(add_offload_test_device_code test_filename test_name)
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
add_custom_command(
OUTPUT ${output_file}
- COMMAND ${CMAKE_C_COMPILER}
+ COMMAND ${CMAKE_CXX_COMPILER}
--target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
-nogpulib -flto ${ARGN} ${SRC_PATH} -o ${output_file}
DEPENDS ${SRC_PATH}
diff --git a/offload/unittests/Conformance/device_code/CMakeLists.txt b/offload/unittests/Conformance/device_code/CMakeLists.txt
index 18f54b8dc5252..9cbd11096292c 100644
--- a/offload/unittests/Conformance/device_code/CMakeLists.txt
+++ b/offload/unittests/Conformance/device_code/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_offload_test_device_code(LLVMLibm.c llvm-libm -stdlib -fno-builtin)
+add_offload_test_device_code(LLVMLibm.cpp llvm-libm -stdlib -fno-builtin)
add_custom_target(conformance_device_binaries DEPENDS llvm-libm.bin)
set(OFFLOAD_CONFORMANCE_DEVICE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/Conformance/device_code/LLVMLibm.c b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
similarity index 97%
rename from offload/unittests/Conformance/device_code/LLVMLibm.c
rename to offload/unittests/Conformance/device_code/LLVMLibm.cpp
index fe5196a539455..2c3d9bc5bf5cf 100644
--- a/offload/unittests/Conformance/device_code/LLVMLibm.c
+++ b/offload/unittests/Conformance/device_code/LLVMLibm.cpp
@@ -19,6 +19,8 @@
typedef _Float16 float16;
+extern "C" {
+
__gpu_kernel void hypotf16Kernel(const float16 *X, float16 *Y, float16 *Out,
size_t NumElements) {
uint32_t Index =
@@ -35,3 +37,4 @@ __gpu_kernel void logfKernel(const float *X, float *Out, size_t NumElements) {
if (Index < NumElements)
Out[Index] = logf(X[Index]);
}
+} // extern "C"
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 0e4695ee9969f..50e430597e646 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -1,14 +1,14 @@
-add_offload_test_device_code(foo.c foo)
-add_offload_test_device_code(bar.c bar)
+add_offload_test_device_code(foo.cpp foo)
+add_offload_test_device_code(bar.cpp bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
-add_offload_test_device_code(noargs.c noargs -O3)
-add_offload_test_device_code(localmem.c localmem)
-add_offload_test_device_code(localmem_reduction.c localmem_reduction)
-add_offload_test_device_code(localmem_static.c localmem_static)
-add_offload_test_device_code(global.c global)
-add_offload_test_device_code(global_ctor.c global_ctor)
-add_offload_test_device_code(global_dtor.c global_dtor)
-add_offload_test_device_code(sequence.c sequence)
+add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(localmem.cpp localmem)
+add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
+add_offload_test_device_code(localmem_static.cpp localmem_static)
+add_offload_test_device_code(global.cpp global)
+add_offload_test_device_code(global_ctor.cpp global_ctor)
+add_offload_test_device_code(global_dtor.cpp global_dtor)
+add_offload_test_device_code(sequence.cpp sequence)
add_custom_target(offload_device_binaries DEPENDS
foo.bin
diff --git a/offload/unittests/OffloadAPI/device_code/bar.c b/offload/unittests/OffloadAPI/device_code/bar.cpp
similarity index 63%
rename from offload/unittests/OffloadAPI/device_code/bar.c
rename to offload/unittests/OffloadAPI/device_code/bar.cpp
index 786aa2f5d61e7..b5191671f293f 100644
--- a/offload/unittests/OffloadAPI/device_code/bar.c
+++ b/offload/unittests/OffloadAPI/device_code/bar.cpp
@@ -1,5 +1,5 @@
#include <gpuintrin.h>
-__gpu_kernel void foo(int *out) {
+extern "C" __gpu_kernel void foo(int *out) {
out[__gpu_thread_id(0)] = __gpu_thread_id(0) + 1;
}
diff --git a/offload/unittests/OffloadAPI/device_code/foo.c b/offload/unittests/OffloadAPI/device_code/foo.cpp
similarity index 65%
rename from offload/unittests/OffloadAPI/device_code/foo.c
rename to offload/unittests/OffloadAPI/device_code/foo.cpp
index 83cdc53cddd8d..cdc20015fc3e2 100644
--- a/offload/unittests/OffloadAPI/device_code/foo.c
+++ b/offload/unittests/OffloadAPI/device_code/foo.cpp
@@ -1,6 +1,6 @@
#include <gpuintrin.h>
#include <stdint.h>
-__gpu_kernel void foo(uint32_t *out) {
+extern "C" __gpu_kernel void foo(uint32_t *out) {
out[__gpu_thread_id(0)] = __gpu_thread_id(0);
}
diff --git a/offload/unittests/OffloadAPI/device_code/global.c b/offload/unittests/OffloadAPI/device_code/global.cpp
similarity index 92%
rename from offload/unittests/OffloadAPI/device_code/global.c
rename to offload/unittests/OffloadAPI/device_code/global.cpp
index 9f27f9424324f..dada16c87766c 100644
--- a/offload/unittests/OffloadAPI/device_code/global.c
+++ b/offload/unittests/OffloadAPI/device_code/global.cpp
@@ -1,6 +1,8 @@
#include <gpuintrin.h>
#include <stdint.h>
+extern "C" {
+
[[gnu::visibility("default")]]
uint32_t global[64];
@@ -13,3 +15,4 @@ __gpu_kernel void read(uint32_t *out) {
out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
global[__gpu_thread_id(0)];
}
+} // extern "C"
diff --git a/offload/unittests/OffloadAPI/device_code/global_ctor.c b/offload/unittests/OffloadAPI/device_code/global_ctor.cpp
similarity index 95%
rename from offload/unittests/OffloadAPI/device_code/global_ctor.c
rename to offload/unittests/OffloadAPI/device_code/global_ctor.cpp
index 27e2d71d7566e..a14f1d59bf950 100644
--- a/offload/unittests/OffloadAPI/device_code/global_ctor.c
+++ b/offload/unittests/OffloadAPI/device_code/global_ctor.cpp
@@ -1,6 +1,8 @@
#include <gpuintrin.h>
#include <stdint.h>
+extern "C" {
+
uint32_t global[64];
[[gnu::constructor(202)]] void ctorc() {
@@ -23,3 +25,4 @@ __gpu_kernel void global_ctor(uint32_t *out) {
out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
global[__gpu_thread_id(0)];
}
+} // extern "C"
diff --git a/offload/unittests/OffloadAPI/device_code/global_dtor.c b/offload/unittests/OffloadAPI/device_code/global_dtor.cpp
similarity index 87%
rename from offload/unittests/OffloadAPI/device_code/global_dtor.c
rename to offload/unittests/OffloadAPI/device_code/global_dtor.cpp
index cadcc19cc296b..6b1f941342b40 100644
--- a/offload/unittests/OffloadAPI/device_code/global_dtor.c
+++ b/offload/unittests/OffloadAPI/device_code/global_dtor.cpp
@@ -1,6 +1,8 @@
#include <gpuintrin.h>
#include <stdint.h>
+extern "C" {
+
uint32_t global[64];
[[gnu::destructor]] void dtor() {
@@ -11,3 +13,4 @@ uint32_t global[64];
__gpu_kernel void global_dtor() {
// no-op
}
+} // extern "C"
diff --git a/offload/unittests/OffloadAPI/device_code/localmem.c b/offload/unittests/OffloadAPI/device_code/localmem.cpp
similarity index 84%
rename from offload/unittests/OffloadAPI/device_code/localmem.c
rename to offload/unittests/OffloadAPI/device_code/localmem.cpp
index d70847900bc43..9542e2cb1d648 100644
--- a/offload/unittests/OffloadAPI/device_code/localmem.c
+++ b/offload/unittests/OffloadAPI/device_code/localmem.cpp
@@ -3,7 +3,7 @@
extern __gpu_local uint32_t shared_mem[];
-__gpu_kernel void localmem(uint32_t *out) {
+extern "C" __gpu_kernel void localmem(uint32_t *out) {
shared_mem[__gpu_thread_id(0)] = __gpu_thread_id(0);
shared_mem[__gpu_thread_id(0)] *= 2;
out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_reduction.c b/offload/unittests/OffloadAPI/device_code/localmem_reduction.cpp
similarity index 83%
rename from offload/unittests/OffloadAPI/device_code/localmem_reduction.c
rename to offload/unittests/OffloadAPI/device_code/localmem_reduction.cpp
index 8a9a46cfb6a11..2c0a3e80b16e7 100644
--- a/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
+++ b/offload/unittests/OffloadAPI/device_code/localmem_reduction.cpp
@@ -3,7 +3,7 @@
extern __gpu_local uint32_t shared_mem[];
-__gpu_kernel void localmem_reduction(uint32_t *out) {
+extern "C" __gpu_kernel void localmem_reduction(uint32_t *out) {
shared_mem[__gpu_thread_id(0)] = 2;
__gpu_sync_threads();
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_static.c b/offload/unittests/OffloadAPI/device_code/localmem_static.cpp
similarity index 85%
rename from offload/unittests/OffloadAPI/device_code/localmem_static.c
rename to offload/unittests/OffloadAPI/device_code/localmem_static.cpp
index 928b48422a0d6..a8dd95473742c 100644
--- a/offload/unittests/OffloadAPI/device_code/localmem_static.c
+++ b/offload/unittests/OffloadAPI/device_code/localmem_static.cpp
@@ -4,7 +4,7 @@
[[clang::loader_uninitialized]]
__gpu_local uint32_t shared_mem[64];
-__gpu_kernel void localmem_static(uint32_t *out) {
+extern "C" __gpu_kernel void localmem_static(uint32_t *out) {
shared_mem[__gpu_thread_id(0)] = 2;
__gpu_sync_threads();
diff --git a/offload/unittests/OffloadAPI/device_code/noargs.c b/offload/unittests/OffloadAPI/device_code/noargs.c
deleted file mode 100644
index 36e609aa26a09..0000000000000
--- a/offload/unittests/OffloadAPI/device_code/noargs.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <gpuintrin.h>
-
-__gpu_kernel void noargs() { (void)0; }
diff --git a/offload/unittests/OffloadAPI/device_code/noargs.cpp b/offload/unittests/OffloadAPI/device_code/noargs.cpp
new file mode 100644
index 0000000000000..58f989c714fed
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/noargs.cpp
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void noargs() { (void)0; }
diff --git a/offload/unittests/OffloadAPI/device_code/sequence.c b/offload/unittests/OffloadAPI/device_code/sequence.cpp
similarity index 71%
rename from offload/unittests/OffloadAPI/device_code/sequence.c
rename to offload/unittests/OffloadAPI/device_code/sequence.cpp
index 7662f2d817496..07f92944346f5 100644
--- a/offload/unittests/OffloadAPI/device_code/sequence.c
+++ b/offload/unittests/OffloadAPI/device_code/sequence.cpp
@@ -1,7 +1,7 @@
#include <gpuintrin.h>
#include <stdint.h>
-__gpu_kernel void sequence(uint32_t idx, uint32_t *inout) {
+extern "C" __gpu_kernel void sequence(uint32_t idx, uint32_t *inout) {
if (idx == 0)
inout[idx] = 0;
else if (idx == 1)
``````````
</details>
https://github.com/llvm/llvm-project/pull/151714
More information about the llvm-commits mailing list