[llvm] [Offload] Add liboffload unit tests for shared/local memory (PR #147040)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 7 03:32:18 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/147040
>From c878ebf678007122b144cc9a1e695f2a88f85111 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 4 Jul 2025 11:59:40 +0100
Subject: [PATCH 1/3] [Offload] Add liboffload unit tests for shared/local
memory
---
.../OffloadAPI/device_code/CMakeLists.txt | 4 +-
.../OffloadAPI/device_code/localmem.c | 11 ++++
.../device_code/localmem_reduction.c | 16 +++++
.../OffloadAPI/kernel/olLaunchKernel.cpp | 63 +++++++++++++++++++
4 files changed, 93 insertions(+), 1 deletion(-)
create mode 100644 offload/unittests/OffloadAPI/device_code/localmem.c
create mode 100644 offload/unittests/OffloadAPI/device_code/localmem_reduction.c
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 132c7a7c51fb8..acc57f3fa3473 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,8 @@ add_offload_test_device_code(foo.c foo)
add_offload_test_device_code(bar.c bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.c noargs -O3)
+add_offload_test_device_code(localmem.c localmem)
+add_offload_test_device_code(localmem_reduction.c localmem_reduction)
-add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
+add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/localmem.c b/offload/unittests/OffloadAPI/device_code/localmem.c
new file mode 100644
index 0000000000000..d70847900bc43
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem.c
@@ -0,0 +1,11 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+__gpu_kernel void localmem(uint32_t *out) { // out[global id] = 2 * local id
+  const uint32_t tid = __gpu_thread_id(0); // this thread's index in its block
+  shared_mem[tid] = tid;                   // stage the index in local memory
+  shared_mem[tid] = shared_mem[tid] * 2;   // double it in place
+  out[tid + __gpu_num_threads(0) * __gpu_block_id(0)] = shared_mem[tid];
+}
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_reduction.c b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
new file mode 100644
index 0000000000000..8a9a46cfb6a11
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
@@ -0,0 +1,16 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+__gpu_kernel void localmem_reduction(uint32_t *out) { // out[block id] = sum
+  shared_mem[__gpu_thread_id(0)] = 2; // each thread fills its own slot
+  __gpu_sync_threads(); // thread 0 reads every thread's slot below
+
+  if (__gpu_thread_id(0) != 0)
+    return;
+  uint32_t sum = 0;
+  for (uint32_t slot = 0; slot < __gpu_num_threads(0); ++slot)
+    sum += shared_mem[slot];
+  out[__gpu_block_id(0)] = sum;
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index a3da334afccac..639a790de8b4a 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -52,6 +52,21 @@ struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
+struct olLaunchKernelLocalMemTest : LaunchKernelTestBase {
+ void SetUp() override {
+ RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("localmem"));
+ }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);
+
+struct olLaunchKernelLocalMemReductionTest : LaunchKernelTestBase {
+ void SetUp() override {
+ RETURN_ON_FATAL_FAILURE(
+ LaunchKernelTestBase::SetUpKernel("localmem_reduction"));
+ }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);
+
TEST_P(olLaunchKernelTest, Success) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
@@ -99,3 +114,51 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
ASSERT_SUCCESS(olMemFree(Mem));
}
+
+TEST_P(olLaunchKernelLocalMemTest, Success) {
+  // The localmem kernel uses one local-memory slot per thread in a group.
+  LaunchArgs.NumGroups.x = 4;
+  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);
+  const auto NumThreads = LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x;
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            NumThreads * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // Thread T of every group writes 2 * T, so the pattern repeats per group.
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < NumThreads; i++)
+    ASSERT_EQ(Data[i], (i % LaunchArgs.GroupSize.x) * 2);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
+  // The kernel uses one local-memory slot per thread in a group.
+  LaunchArgs.NumGroups.x = 4;
+  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // Each group's first thread sums GroupSize.x slots that all hold 2.
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
+    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
>From ea2183ed8c61273ef077b01aa0b047cc82e97873 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 7 Jul 2025 11:30:14 +0100
Subject: [PATCH 2/3] Add static test
---
.../OffloadAPI/device_code/CMakeLists.txt | 10 ++-
.../OffloadAPI/kernel/olLaunchKernel.cpp | 68 ++++++++++---------
2 files changed, 46 insertions(+), 32 deletions(-)
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index acc57f3fa3473..d867e2aae1316 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -4,6 +4,14 @@ add_offload_test_device_code(bar.c bar)
add_offload_test_device_code(noargs.c noargs -O3)
add_offload_test_device_code(localmem.c localmem)
add_offload_test_device_code(localmem_reduction.c localmem_reduction)
+add_offload_test_device_code(localmem_static.c localmem_static)
-add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
+add_custom_target(offload_device_binaries DEPENDS # keep in sync with the add_offload_test_device_code() calls above
+ foo.bin
+ bar.bin
+ noargs.bin
+ localmem.bin
+ localmem_reduction.bin
+ localmem_static.bin
+)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 639a790de8b4a..b444aed9a6bea 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -38,36 +38,19 @@ struct LaunchKernelTestBase : OffloadQueueTest {
ol_kernel_launch_size_args_t LaunchArgs{};
};
-struct olLaunchKernelTest : LaunchKernelTestBase {
- void SetUp() override {
- RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("foo"));
- }
-};
-OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelTest);
-
-struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
- void SetUp() override {
- RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("noargs"));
- }
-};
-OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
-
-struct olLaunchKernelLocalMemTest : LaunchKernelTestBase {
- void SetUp() override {
- RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("localmem"));
- }
-};
-OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);
-
-struct olLaunchKernelLocalMemReductionTest : LaunchKernelTestBase {
- void SetUp() override {
- RETURN_ON_FATAL_FAILURE(
- LaunchKernelTestBase::SetUpKernel("localmem_reduction"));
- }
-};
-OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);
-
-TEST_P(olLaunchKernelTest, Success) {
+// Declares and instantiates fixture olLaunchKernel<NAME>Test for kernel KERNEL.
+#define KERNEL_TEST(NAME, KERNEL)                                             \
+  struct olLaunchKernel##NAME##Test : LaunchKernelTestBase {                  \
+    void SetUp() override { RETURN_ON_FATAL_FAILURE(SetUpKernel(#KERNEL)); }  \
+  };                                                                          \
+  OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
+KERNEL_TEST(Foo, foo) // fixture olLaunchKernelFooTest, SetUpKernel("foo")
+KERNEL_TEST(NoArgs, noargs) // fixture olLaunchKernelNoArgsTest, SetUpKernel("noargs")
+KERNEL_TEST(LocalMem, localmem) // fixture olLaunchKernelLocalMemTest, SetUpKernel("localmem")
+KERNEL_TEST(LocalMemReduction, localmem_reduction) // fixture olLaunchKernelLocalMemReductionTest, SetUpKernel("localmem_reduction")
+KERNEL_TEST(LocalMemStatic, localmem_static) // fixture olLaunchKernelLocalMemStaticTest, SetUpKernel("localmem_static")
+
+TEST_P(olLaunchKernelFooTest, Success) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
@@ -95,7 +78,7 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(olWaitQueue(Queue));
}
-TEST_P(olLaunchKernelTest, SuccessSynchronous) {
+TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
@@ -162,3 +145,26 @@ TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
ASSERT_SUCCESS(olMemFree(Mem));
}
+
+TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
+  // Launch 4 groups without requesting any dynamic local memory.
+  LaunchArgs.DynSharedMemory = 0;
+  LaunchArgs.NumGroups.x = 4;
+
+  void *Out;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Out));
+  struct {
+    void *Ptr;
+  } KernelArgs{Out};
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &KernelArgs,
+                                sizeof(KernelArgs), &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // One result per group, each expected to equal 2 * GroupSize.x.
+  const auto *Results = static_cast<const uint32_t *>(Out);
+  for (uint32_t Group = 0; Group < LaunchArgs.NumGroups.x; ++Group)
+    ASSERT_EQ(Results[Group], 2 * LaunchArgs.GroupSize.x);
+
+  ASSERT_SUCCESS(olMemFree(Out));
+}
>From 82e76b9736b40cb000da51157fb55539d194e643 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 7 Jul 2025 11:31:58 +0100
Subject: [PATCH 3/3] Actually add the test
---
.../OffloadAPI/device_code/localmem_static.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
create mode 100644 offload/unittests/OffloadAPI/device_code/localmem_static.c
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_static.c b/offload/unittests/OffloadAPI/device_code/localmem_static.c
new file mode 100644
index 0000000000000..928b48422a0d6
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem_static.c
@@ -0,0 +1,17 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+[[clang::loader_uninitialized]]
+__gpu_local uint32_t shared_mem[64];
+
+__gpu_kernel void localmem_static(uint32_t *out) { // out[block id] = sum
+  const uint32_t tid = __gpu_thread_id(0);
+  shared_mem[tid] = 2;  // each thread fills its own slot of the static array
+  __gpu_sync_threads(); // thread 0 reads every thread's slot below
+  if (tid == 0) {
+    uint32_t total = 0;
+    for (uint32_t i = 0; i < __gpu_num_threads(0); ++i)
+      total += shared_mem[i];
+    out[__gpu_block_id(0)] = total;
+  }
+}
More information about the llvm-commits
mailing list