[llvm] [Offload] Tests for global memory and constructors (PR #147537)

Tue Jul 8 07:25:34 PDT 2025

https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/147537

Adds two "launch kernel" tests for liboffload: one checks that global
memory works and persists between different kernels, and the other
verifies that functions marked `__attribute__((constructor))` are run.

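Since we now have tests that contain multiple kernels in the same
binary, the test fixtures have been split: program loading
(`SetUpProgram`) is now separate from kernel lookup, with one fixture
for single-kernel programs and one for multi-kernel programs.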


>From d143d0c9d3fcba17a8374c10d8b839d24758c768 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 8 Jul 2025 15:22:47 +0100
Subject: [PATCH] [Offload] Tests for global memory and constructors

Adds two "launch kernel" tests for liboffload: one checks that global
memory works and persists between different kernels, and the other
verifies that functions marked `__attribute__((constructor))` are run.

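Since we now have tests that contain multiple kernels in the same
binary, the test fixtures have been split: program loading
(`SetUpProgram`) is now separate from kernel lookup, with one fixture
for single-kernel programs and one for multi-kernel programs.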
---
 .../OffloadAPI/device_code/CMakeLists.txt     |  4 +
 .../unittests/OffloadAPI/device_code/global.c | 14 +++
 .../OffloadAPI/device_code/global_ctor.c      | 16 ++++
 .../OffloadAPI/kernel/olLaunchKernel.cpp      | 85 +++++++++++++++++--
 4 files changed, 113 insertions(+), 6 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/device_code/global.c
 create mode 100644 offload/unittests/OffloadAPI/device_code/global_ctor.c

diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index d867e2aae1316..a891b3229f7e8 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -5,6 +5,8 @@ add_offload_test_device_code(noargs.c noargs -O3)
 add_offload_test_device_code(localmem.c localmem)
 add_offload_test_device_code(localmem_reduction.c localmem_reduction)
 add_offload_test_device_code(localmem_static.c localmem_static)
+add_offload_test_device_code(global.c global)
+add_offload_test_device_code(global_ctor.c global_ctor)
 
 add_custom_target(offload_device_binaries DEPENDS
     foo.bin
@@ -13,5 +15,7 @@ add_custom_target(offload_device_binaries DEPENDS
     localmem.bin
     localmem_reduction.bin
     localmem_static.bin
+    global.bin
+    global_ctor.bin
 )
 set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/global.c b/offload/unittests/OffloadAPI/device_code/global.c
new file mode 100644
index 0000000000000..b30e406fb98c7
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/global.c
@@ -0,0 +1,14 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+uint32_t global[64]; // File-scope array written by write(), read by read().
+
+__gpu_kernel void write() { // Leaves 2 * id in each launched thread's slot.
+  global[__gpu_thread_id(0)] = __gpu_thread_id(0); // Store the thread id...
+  global[__gpu_thread_id(0)] *= 2;                 // ...then double in place.
+}
+
+__gpu_kernel void read(uint32_t *out) { // Copies global[] values into out.
+  out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
+      global[__gpu_thread_id(0)]; // Source slot is picked by thread id only.
+}
diff --git a/offload/unittests/OffloadAPI/device_code/global_ctor.c b/offload/unittests/OffloadAPI/device_code/global_ctor.c
new file mode 100644
index 0000000000000..5846571fa43bc
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/global_ctor.c
@@ -0,0 +1,16 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+[[clang::loader_uninitialized]] // Not initialized at load; ctor() fills it.
+uint32_t global[64];
+
+__attribute__((constructor)) void ctor() { // Sets every element to 100.
+  for (unsigned I = 0; I < 64; I++)
+    global[I] = 100;
+}
+
+__gpu_kernel void global_ctor(uint32_t *out) {
+  global[__gpu_thread_id(0)] += __gpu_thread_id(0); // Add the id to the slot.
+  out[__gpu_thread_id(0) + (__gpu_num_threads(0) * __gpu_block_id(0))] =
+      global[__gpu_thread_id(0)]; // Copy the updated slot into out.
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index b444aed9a6bea..733461a500bdb 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -11,13 +11,13 @@
 #include <gtest/gtest.h>
 
 struct LaunchKernelTestBase : OffloadQueueTest {
-  void SetUpKernel(const char *kernel) {
+  void SetUpProgram(const char *program) {
     RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
-    ASSERT_TRUE(TestEnvironment::loadDeviceBinary(kernel, Device, DeviceBin));
+    ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
     ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
     ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
                                    DeviceBin->getBufferSize(), &Program));
-    ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
+
     LaunchArgs.Dimensions = 1;
     LaunchArgs.GroupSize = {64, 1, 1};
     LaunchArgs.NumGroups = {1, 1, 1};
@@ -34,13 +34,21 @@ struct LaunchKernelTestBase : OffloadQueueTest {
 
   std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
   ol_program_handle_t Program = nullptr;
-  ol_kernel_handle_t Kernel = nullptr;
   ol_kernel_launch_size_args_t LaunchArgs{};
 };
 
+struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
+  void SetUpKernel(const char *kernel) { // Program and kernel share this name.
+    RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
+    ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
+  }
+
+  ol_kernel_handle_t Kernel = nullptr; // Filled in by SetUpKernel().
+};
+
 #define KERNEL_TEST(NAME, KERNEL)                                              \
-  struct olLaunchKernel##NAME##Test : LaunchKernelTestBase {                   \
-    void SetUp() override { LaunchKernelTestBase::SetUpKernel(#KERNEL); }      \
+  struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase {             \
+    void SetUp() override { SetUpKernel(#KERNEL); }                            \
   };                                                                           \
   OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
 
@@ -49,6 +57,28 @@ KERNEL_TEST(NoArgs, noargs)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
 KERNEL_TEST(LocalMemStatic, localmem_static)
+KERNEL_TEST(GlobalCtor, global_ctor)
+
+struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
+  // Loads `program`, then stores the handle for kernels[i] in Kernels[i].
+  void SetUpKernels(const char *program, std::vector<const char *> kernels) {
+    RETURN_ON_FATAL_FAILURE(SetUpProgram(program));
+
+    Kernels.resize(kernels.size());
+    for (size_t Idx = 0; Idx < kernels.size(); ++Idx)
+      ASSERT_SUCCESS(olGetKernel(Program, kernels[Idx], &Kernels[Idx]));
+  }
+
+  std::vector<ol_kernel_handle_t> Kernels;
+};
+// Defines olLaunchKernel<NAME>Test over PROGRAM and the named kernels.
+#define KERNEL_MULTI_TEST(NAME, PROGRAM, ...)                                  \
+  struct olLaunchKernel##NAME##Test : LaunchMultipleKernelTestBase {           \
+    void SetUp() override { SetUpKernels(#PROGRAM, {__VA_ARGS__}); }           \
+  };                                                                           \
+  OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
+
+KERNEL_MULTI_TEST(Global, global, "write", "read")
 
 TEST_P(olLaunchKernelFooTest, Success) {
   void *Mem;
@@ -168,3 +198,46 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
 
   ASSERT_SUCCESS(olMemFree(Mem));
 }
+
+TEST_P(olLaunchKernelGlobalTest, Success) {
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+  // Kernels[0] is "write" and Kernels[1] is "read"; run them one after another.
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0,
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+  // "write" stored 2 * id per thread; "read" must have copied that into Mem.
+  const uint32_t *Data = static_cast<const uint32_t *>(Mem);
+  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
+    ASSERT_EQ(Data[I], I * 2);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+TEST_P(olLaunchKernelGlobalCtorTest, Success) {
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+  // The kernel adds each thread id to a slot that ctor() preset to 100.
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+  // Seeing 100 + id in every slot shows the constructor ran before the kernel.
+  const uint32_t *Data = static_cast<const uint32_t *>(Mem);
+  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
+    ASSERT_EQ(Data[I], I + 100);
+  }
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}