[llvm] [Offload] Tests for global memory and constructors (PR #147537)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 9 04:05:20 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/147537
>From d143d0c9d3fcba17a8374c10d8b839d24758c768 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 8 Jul 2025 15:22:47 +0100
Subject: [PATCH 1/2] [Offload] Tests for global memory and constructors
Adds two "launch kernel" tests for liboffload: one testing that
global memory works and persists between different kernels, and one
verifying that `__attribute__((constructor))` works correctly.
Since we now have tests that contain multiple kernels in the same
binary, the test fixtures have been split so that loading a program
is separate from looking up the kernel(s) inside it.
---
.../OffloadAPI/device_code/CMakeLists.txt | 4 +
.../unittests/OffloadAPI/device_code/global.c | 14 +++
.../OffloadAPI/device_code/global_ctor.c | 16 ++++
.../OffloadAPI/kernel/olLaunchKernel.cpp | 85 +++++++++++++++++--
4 files changed, 113 insertions(+), 6 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/device_code/global.c
create mode 100644 offload/unittests/OffloadAPI/device_code/global_ctor.c
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index d867e2aae1316..a891b3229f7e8 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -5,6 +5,8 @@ add_offload_test_device_code(noargs.c noargs -O3)
add_offload_test_device_code(localmem.c localmem)
add_offload_test_device_code(localmem_reduction.c localmem_reduction)
add_offload_test_device_code(localmem_static.c localmem_static)
+add_offload_test_device_code(global.c global)
+add_offload_test_device_code(global_ctor.c global_ctor)
add_custom_target(offload_device_binaries DEPENDS
foo.bin
@@ -13,5 +15,7 @@ add_custom_target(offload_device_binaries DEPENDS
localmem.bin
localmem_reduction.bin
localmem_static.bin
+ global.bin
+ global_ctor.bin
)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/global.c b/offload/unittests/OffloadAPI/device_code/global.c
new file mode 100644
index 0000000000000..b30e406fb98c7
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/global.c
@@ -0,0 +1,14 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+uint32_t global[64];
+// Kernel: each thread stores twice its dimension-0 thread id in its global[] slot.
+__gpu_kernel void write() {
+ global[__gpu_thread_id(0)] = __gpu_thread_id(0);
+ global[__gpu_thread_id(0)] *= 2; // second access doubles the value just stored
+}
+// Kernel: copies this thread's global[] slot to out[thread_id + num_threads * block_id].
+__gpu_kernel void read(uint32_t *out) {
+ out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
+ global[__gpu_thread_id(0)];
+}
diff --git a/offload/unittests/OffloadAPI/device_code/global_ctor.c b/offload/unittests/OffloadAPI/device_code/global_ctor.c
new file mode 100644
index 0000000000000..5846571fa43bc
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/global_ctor.c
@@ -0,0 +1,16 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+[[clang::loader_uninitialized]]
+uint32_t global[64];
+
+__attribute__((constructor)) void ctor() {
+ for (unsigned I = 0; I < 64; I++)
+ global[I] = 100;
+}
+
+__gpu_kernel void global_ctor(uint32_t *out) {
+ global[__gpu_thread_id(0)] += __gpu_thread_id(0); // adds the thread id to whatever the constructor left in the slot
+ out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
+ global[__gpu_thread_id(0)]; // copies the updated slot to out[thread_id + num_threads * block_id]
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index b444aed9a6bea..733461a500bdb 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -11,13 +11,13 @@
#include <gtest/gtest.h>
struct LaunchKernelTestBase : OffloadQueueTest {
- void SetUpKernel(const char *kernel) {
+ void SetUpProgram(const char *program) {
RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
- ASSERT_TRUE(TestEnvironment::loadDeviceBinary(kernel, Device, DeviceBin));
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
DeviceBin->getBufferSize(), &Program));
- ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
+
LaunchArgs.Dimensions = 1;
LaunchArgs.GroupSize = {64, 1, 1};
LaunchArgs.NumGroups = {1, 1, 1};
@@ -34,13 +34,21 @@ struct LaunchKernelTestBase : OffloadQueueTest {
std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
ol_program_handle_t Program = nullptr;
- ol_kernel_handle_t Kernel = nullptr;
ol_kernel_launch_size_args_t LaunchArgs{};
};
+struct LaunchSingleKernelTestBase : LaunchKernelTestBase { // fixture base for tests that use one kernel per program
+ void SetUpKernel(const char *kernel) { // `kernel` is used as both the program name and the kernel name
+ RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
+ ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
+ }
+ // Handle written by olGetKernel() in SetUpKernel(); nullptr until then.
+ ol_kernel_handle_t Kernel = nullptr;
+};
+
#define KERNEL_TEST(NAME, KERNEL) \
- struct olLaunchKernel##NAME##Test : LaunchKernelTestBase { \
- void SetUp() override { LaunchKernelTestBase::SetUpKernel(#KERNEL); } \
+ struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase { \
+ void SetUp() override { SetUpKernel(#KERNEL); } \
}; \
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
@@ -49,6 +57,28 @@ KERNEL_TEST(NoArgs, noargs)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
KERNEL_TEST(LocalMemStatic, localmem_static)
+KERNEL_TEST(GlobalCtor, global_ctor)
+
+struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
+ void SetUpKernels(const char *program, std::vector<const char *> kernels) {
+ RETURN_ON_FATAL_FAILURE(SetUpProgram(program));
+ // Look up every named kernel in `program`; Kernels[i] corresponds to kernels[i].
+ Kernels.resize(kernels.size());
+ size_t I = 0;
+ for (auto K : kernels)
+ ASSERT_SUCCESS(olGetKernel(Program, K, &Kernels[I++]));
+ }
+ // Kernel handles, in the same order as the names passed to SetUpKernels().
+ std::vector<ol_kernel_handle_t> Kernels;
+};
+// Defines fixture olLaunchKernel<NAME>Test whose SetUp() calls SetUpKernels(PROGRAM, {kernel names...}), then passes it to OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE.
+#define KERNEL_MULTI_TEST(NAME, PROGRAM, ...) \
+ struct olLaunchKernel##NAME##Test : LaunchMultipleKernelTestBase { \
+ void SetUp() override { SetUpKernels(#PROGRAM, {__VA_ARGS__}); } \
+ }; \
+ OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
+
+KERNEL_MULTI_TEST(Global, global, "write", "read")
TEST_P(olLaunchKernelFooTest, Success) {
void *Mem;
@@ -168,3 +198,46 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
ASSERT_SUCCESS(olMemFree(Mem));
}
+// Launches "write" and then "read" from the same program and checks that the values "write" stored are what "read" copies out.
+TEST_P(olLaunchKernelGlobalTest, Success) {
+ void *Mem;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+ LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+ struct {
+ void *Mem;
+ } Args{Mem};
+ // Kernels[0] is "write" (launched with no arguments); Kernels[1] is "read" (receives the output buffer).
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0,
+ &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olWaitQueue(Queue));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
+ &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olWaitQueue(Queue));
+ // "write" stored 2 * thread id per slot, so element i must read back as i * 2.
+ uint32_t *Data = (uint32_t *)Mem;
+ for (uint32_t i = 0; i < 64; i++) {
+ ASSERT_EQ(Data[i], i * 2);
+ }
+
+ ASSERT_SUCCESS(olMemFree(Mem));
+}
+// Launches global_ctor and checks every output element is 100 plus its index, i.e. the device constructor code ran before the kernel.
+TEST_P(olLaunchKernelGlobalCtorTest, Success) {
+ void *Mem;
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+ LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+ struct {
+ void *Mem;
+ } Args{Mem};
+
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olWaitQueue(Queue));
+ // The kernel adds the thread id to each slot, so the expected base value left by the constructor code is 100.
+ uint32_t *Data = (uint32_t *)Mem;
+ for (uint32_t i = 0; i < 64; i++) {
+ ASSERT_EQ(Data[i], i + 100);
+ }
+
+ ASSERT_SUCCESS(olMemFree(Mem));
+}
>From 32e58bd2470c680de5b368e0eef990e529ff60e9 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Wed, 9 Jul 2025 12:04:57 +0100
Subject: [PATCH 2/2] Add ctor priority and dtor testing
---
.../OffloadAPI/device_code/CMakeLists.txt | 2 ++
.../OffloadAPI/device_code/global_ctor.c | 15 ++++++++++++---
.../OffloadAPI/device_code/global_dtor.c | 13 +++++++++++++
.../OffloadAPI/kernel/olLaunchKernel.cpp | 10 ++++++++++
4 files changed, 37 insertions(+), 3 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/device_code/global_dtor.c
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index a891b3229f7e8..11c8ccbd6c7c5 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -7,6 +7,7 @@ add_offload_test_device_code(localmem_reduction.c localmem_reduction)
add_offload_test_device_code(localmem_static.c localmem_static)
add_offload_test_device_code(global.c global)
add_offload_test_device_code(global_ctor.c global_ctor)
+add_offload_test_device_code(global_dtor.c global_dtor)
add_custom_target(offload_device_binaries DEPENDS
foo.bin
@@ -17,5 +18,6 @@ add_custom_target(offload_device_binaries DEPENDS
localmem_static.bin
global.bin
global_ctor.bin
+ global_dtor.bin
)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/global_ctor.c b/offload/unittests/OffloadAPI/device_code/global_ctor.c
index 5846571fa43bc..27e2d71d7566e 100644
--- a/offload/unittests/OffloadAPI/device_code/global_ctor.c
+++ b/offload/unittests/OffloadAPI/device_code/global_ctor.c
@@ -1,12 +1,21 @@
#include <gpuintrin.h>
#include <stdint.h>
-[[clang::loader_uninitialized]]
uint32_t global[64];
-__attribute__((constructor)) void ctor() {
+[[gnu::constructor(202)]] void ctorc() {
for (unsigned I = 0; I < 64; I++)
- global[I] = 100;
+ global[I] += 20; // priority 202 is the highest number here, so this is expected to run last: 80 + 20 = 100
+}
+// Priority 200 is the lowest number in this file, so this constructor is expected to run first; it sets every slot to 40.
+[[gnu::constructor(200)]] void ctora() {
+ for (unsigned I = 0; I < 64; I++)
+ global[I] = 40;
+}
+// Priority 201: expected to run between the 200 and 202 constructors, doubling every slot (40 -> 80).
+[[gnu::constructor(201)]] void ctorb() {
+ for (unsigned I = 0; I < 64; I++)
+ global[I] *= 2;
}
__gpu_kernel void global_ctor(uint32_t *out) {
diff --git a/offload/unittests/OffloadAPI/device_code/global_dtor.c b/offload/unittests/OffloadAPI/device_code/global_dtor.c
new file mode 100644
index 0000000000000..cadcc19cc296b
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/global_dtor.c
@@ -0,0 +1,13 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+uint32_t global[64];
+// Destructor function: overwrites every slot of global[] with 1.
+[[gnu::destructor]] void dtor() {
+ for (unsigned I = 0; I < 64; I++)
+ global[I] = 1;
+}
+// Kernel with an intentionally empty body; it does not touch global[].
+__gpu_kernel void global_dtor() {
+ // no-op
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 733461a500bdb..41d5c79c42ded 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -58,6 +58,7 @@ KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
KERNEL_TEST(LocalMemStatic, localmem_static)
KERNEL_TEST(GlobalCtor, global_ctor)
+KERNEL_TEST(GlobalDtor, global_dtor)
struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
void SetUpKernels(const char *program, std::vector<const char *> kernels) {
@@ -241,3 +242,12 @@ TEST_P(olLaunchKernelGlobalCtorTest, Success) {
ASSERT_SUCCESS(olMemFree(Mem));
}
+// Smoke test: launching the kernel from the program that defines a destructor function succeeds.
+TEST_P(olLaunchKernelGlobalDtorTest, Success) {
+ // TODO: We can't inspect the result of a destructor yet. Once we
+ // find/implement a way, update this test. For now we just check that nothing
+ // crashes.
+ ASSERT_SUCCESS(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olWaitQueue(Queue));
+}
More information about the llvm-commits
mailing list