[llvm] [Offload] Allow setting null arguments in olLaunchKernel and fix amdgpu handling for empty kernel arguments (PR #141958)

Mon Jun 2 02:43:33 PDT 2025

https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/141958

>From ff6f89318ecc2c89fcabff01b65ef1e4a01d987a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 30 May 2025 13:46:37 -0500
Subject: [PATCH 1/4] [Offload][AMDGPU] Correctly handle variable implicit
 argument sizes

Summary:
The size of the implicit argument struct can vary depending on
optimizations, it is not always the size as listed by the full struct.
Additionally, the implicit arguments are always aligned on a pointer
boundary. This patch updates the handling to use the correctly aligned
offset and only initialize the members if they are contained in the
reported size.

Additionally, we modify the `alloc` and `free` routines to allow
`alloc(0)` and `free(nullptr)` as these are mandated by the C standard
and allow us to easily handle cases where the user calls a kernel with
no arguments.
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 57 ++++++++++++-------
 .../amdgpu/utils/UtilitiesRTL.h               | 15 ++++-
 2 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 2733796611d9b..88f3b1ac5416c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -420,7 +420,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
     assert(PtrStorage && "Invalid pointer storage");
 
     *PtrStorage = MemoryManager->allocate(Size, nullptr);
-    if (*PtrStorage == nullptr)
+    if (Size && *PtrStorage == nullptr)
       return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                            "failure to allocate from AMDGPU memory manager");
 
@@ -429,7 +429,8 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 
   /// Release an allocation to be reused.
   Error deallocate(void *Ptr) {
-    assert(Ptr && "Invalid pointer");
+    if (!Ptr)
+      return Plugin::success();
 
     if (MemoryManager->free(Ptr))
       return Plugin::error(ErrorCode::UNKNOWN,
@@ -3365,7 +3366,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                  KernelLaunchParamsTy LaunchParams,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   if (ArgsSize != LaunchParams.Size &&
-      ArgsSize != LaunchParams.Size + getImplicitArgsSize())
+      ArgsSize > LaunchParams.Size + getImplicitArgsSize())
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                          "mismatch of kernel arguments size");
 
@@ -3401,23 +3402,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
-  if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
-    ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
-        utils::advancePtr(AllArgs, LaunchParams.Size));
-
-    // Set the COV5+ implicit arguments to the appropriate values.
-    std::memset(ImplArgs, 0, getImplicitArgsSize());
-    ImplArgs->BlockCountX = NumBlocks[0];
-    ImplArgs->BlockCountY = NumBlocks[1];
-    ImplArgs->BlockCountZ = NumBlocks[2];
-    ImplArgs->GroupSizeX = NumThreads[0];
-    ImplArgs->GroupSizeY = NumThreads[1];
-    ImplArgs->GroupSizeZ = NumThreads[2];
-    ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
-                             ? 3
-                             : 1 + (NumBlocks[1] * NumThreads[1] != 1);
-    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
+  uint64_t ImplArgsOffset = utils::roundUp(
+      LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
+  if (ArgsSize > ImplArgsOffset) {
+    hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
+        reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
+            utils::advancePtr(AllArgs, ImplArgsOffset));
+
+    // Set the COV5+ implicit arguments to the appropriate values if present.
+    uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
+    std::memset(ImplArgs, 0, ImplArgsSize);
+
+    using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
+                           NumBlocks[0]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
+                           NumBlocks[1]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
+                           NumBlocks[2]);
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
+                           NumThreads[0]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
+                           NumThreads[1]);
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
+                           NumThreads[2]);
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
+                           NumBlocks[2] * NumThreads[2] > 1
+                               ? 3
+                               : 1 + (NumBlocks[1] * NumThreads[1] != 1));
+
+    hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
+                           KernelArgs.DynCGroupMem);
   }
 
   // Push the kernel launch into the stream.
diff --git a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 609ead942dbb3..77c756e006029 100644
--- a/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -13,6 +13,7 @@
 #include <cstdint>
 
 #include "Shared/Debug.h"
+#include "Shared/Utils.h"
 #include "Utils/ELF.h"
 
 #include "omptarget.h"
@@ -26,7 +27,7 @@ namespace plugin {
 namespace hsa_utils {
 
 // The implicit arguments of COV5 AMDGPU kernels.
-struct AMDGPUImplicitArgsTy {
+struct alignas(alignof(void *)) AMDGPUImplicitArgsTy {
   uint32_t BlockCountX;
   uint32_t BlockCountY;
   uint32_t BlockCountZ;
@@ -60,6 +61,18 @@ inline Error readAMDGPUMetaDataFromImage(
   return Err;
 }
 
+/// Initializes the HSA implicit argument if the struct size permits it. This is
+/// necessary because optimizations can modify the size of the struct if
+/// portions of it are unused.
+template <typename MemberTy, typename T>
+void initImplArg(AMDGPUImplicitArgsTy *Base,
+                 MemberTy AMDGPUImplicitArgsTy::*Member, size_t AvailableSize,
+                 T Value) {
+  uint64_t Offset = utils::getPtrDiff(&(Base->*Member), Base);
+  if (Offset + sizeof(MemberTy) <= AvailableSize)
+    Base->*Member = static_cast<MemberTy>(Value);
+}
+
 } // namespace hsa_utils
 } // namespace plugin
 } // namespace target

>From f441061d5f4b6c9934bc9df05cc3dab50b71b823 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Thu, 29 May 2025 15:56:03 +0100
Subject: [PATCH 2/4] [Offload] Allow setting null arguments in olLaunchKernel

---
 offload/liboffload/API/Kernel.td              |  6 +++--
 .../liboffload/include/generated/OffloadAPI.h |  6 +++--
 .../include/generated/OffloadEntryPoints.inc  | 11 ++++----
 .../OffloadAPI/device_code/CMakeLists.txt     |  1 +
 .../unittests/OffloadAPI/device_code/noargs.c |  3 +++
 .../OffloadAPI/kernel/olLaunchKernel.cpp      | 27 ++++++++++++++++---
 6 files changed, 41 insertions(+), 13 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/device_code/noargs.c

diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..45e3d8112791c 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -43,19 +43,21 @@ def : Function {
     let name = "olLaunchKernel";
     let desc = "Enqueue a kernel launch with the specified size and parameters.";
     let details = [
-        "If a queue is not specified, kernel execution happens synchronously"
+        "If a queue is not specified, kernel execution happens synchronously",
+        "ArgumentsData may be set to NULL (to indicate no parameters)"
     ];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
         Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
-        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN>,
+        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
         Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
         Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
     let returns = [
         Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
+        Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
         Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
     ];
 }
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index a1d7540519e32..d918a51e8ffd7 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -692,6 +692,7 @@ typedef struct ol_kernel_launch_size_args_t {
 ///
 /// @details
 ///    - If a queue is not specified, kernel execution happens synchronously
+///    - ArgumentsData may be set to NULL (to indicate no parameters)
 ///
 /// @returns
 ///     - ::OL_RESULT_SUCCESS
@@ -699,13 +700,14 @@ typedef struct ol_kernel_launch_size_args_t {
 ///     - ::OL_ERRC_DEVICE_LOST
 ///     - ::OL_ERRC_INVALID_ARGUMENT
 ///         + `Queue == NULL && EventOut != NULL`
+///     - ::OL_ERRC_INVALID_ARGUMENT
+///         + `ArgumentsSize > 0 && ArgumentsData == NULL`
 ///     - ::OL_ERRC_INVALID_DEVICE
 ///         + If Queue is non-null but does not belong to Device
 ///     - ::OL_ERRC_INVALID_NULL_HANDLE
 ///         + `NULL == Device`
 ///         + `NULL == Kernel`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == ArgumentsData`
 ///         + `NULL == LaunchSizeArgs`
 OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
     // [in][optional] handle of the queue
@@ -714,7 +716,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
     ol_device_handle_t Device,
     // [in] handle of the kernel
     ol_kernel_handle_t Kernel,
-    // [in] pointer to the kernel argument struct
+    // [in][optional] pointer to the kernel argument struct
     const void *ArgumentsData,
     // [in] size of the kernel argument struct
     size_t ArgumentsSize,
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 9feebeea09ec3..d629e14a6bb55 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -838,6 +838,12 @@ olLaunchKernel_val(ol_queue_handle_t Queue, ol_device_handle_t Device,
           "validation failure: Queue == NULL && EventOut != NULL");
     }
 
+    if (ArgumentsSize > 0 && ArgumentsData == NULL) {
+      return createOffloadError(
+          error::ErrorCode::INVALID_ARGUMENT,
+          "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");
+    }
+
     if (NULL == Device) {
       return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
                                 "validation failure: NULL == Device");
@@ -848,11 +854,6 @@ olLaunchKernel_val(ol_queue_handle_t Queue, ol_device_handle_t Device,
                                 "validation failure: NULL == Kernel");
     }
 
-    if (NULL == ArgumentsData) {
-      return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
-                                "validation failure: NULL == ArgumentsData");
-    }
-
     if (NULL == LaunchSizeArgs) {
       return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
                                 "validation failure: NULL == LaunchSizeArgs");
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 5814943e4aaa9..91a9ca24786a3 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -61,6 +61,7 @@ endif()
 
 add_offload_test_device_code(foo.c foo)
 add_offload_test_device_code(bar.c bar)
+add_offload_test_device_code(noargs.c noargs)
 
 add_custom_target(OffloadUnitTestsDeviceBins DEPENDS ${BIN_PATHS})
 
diff --git a/offload/unittests/OffloadAPI/device_code/noargs.c b/offload/unittests/OffloadAPI/device_code/noargs.c
new file mode 100644
index 0000000000000..36e609aa26a09
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/noargs.c
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+__gpu_kernel void noargs() { (void)0; }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 20462e22fd73f..fe511a9decf1c 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -10,14 +10,14 @@
 #include <OffloadAPI.h>
 #include <gtest/gtest.h>
 
-struct olLaunchKernelTest : OffloadQueueTest {
-  void SetUp() override {
+struct LaunchKernelTestBase : OffloadQueueTest {
+  void SetUpKernel(const char *kernel) {
     RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
-    ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+    ASSERT_TRUE(TestEnvironment::loadDeviceBinary(kernel, Device, DeviceBin));
     ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
     ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
                                    DeviceBin->getBufferSize(), &Program));
-    ASSERT_SUCCESS(olGetKernel(Program, "foo", &Kernel));
+    ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
     LaunchArgs.Dimensions = 1;
     LaunchArgs.GroupSizeX = 64;
     LaunchArgs.GroupSizeY = 1;
@@ -43,8 +43,20 @@ struct olLaunchKernelTest : OffloadQueueTest {
   ol_kernel_launch_size_args_t LaunchArgs{};
 };
 
+struct olLaunchKernelTest : LaunchKernelTestBase {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("foo"));
+  }
+};
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelTest);
 
+struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("noargs"));
+  }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
+
 TEST_P(olLaunchKernelTest, Success) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 64, &Mem));
@@ -65,6 +77,13 @@ TEST_P(olLaunchKernelTest, Success) {
   ASSERT_SUCCESS(olMemFree(Mem));
 }
 
+TEST_P(olLaunchKernelNoArgsTest, Success) {
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
+
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+}
+
 TEST_P(olLaunchKernelTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 64, &Mem));

>From 9542f35af8be39de9ec08f9f5c54483233378347 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 30 May 2025 10:14:16 +0100
Subject: [PATCH 3/4] Set opts for noargs.c

---
 offload/unittests/OffloadAPI/device_code/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 91a9ca24786a3..c2e4d0cb24e6e 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -7,6 +7,7 @@ macro(add_offload_test_device_code test_filename test_name)
         add_custom_command(OUTPUT ${BIN_PATH}
             COMMAND
             ${CMAKE_C_COMPILER} --target=nvptx64-nvidia-cuda
+            ${ARGN}
             -march=${LIBOMPTARGET_DEP_CUDA_ARCH}
             --cuda-path=${CUDA_ROOT}
             ${SRC_PATH} -o ${BIN_PATH}
@@ -21,6 +22,7 @@ macro(add_offload_test_device_code test_filename test_name)
         add_custom_command(OUTPUT ${BIN_PATH}
             COMMAND
             ${CMAKE_C_COMPILER} --target=amdgcn-amd-amdhsa -nogpulib
+            ${ARGN}
             -mcpu=${LIBOMPTARGET_DEP_AMDGPU_ARCH}
             ${SRC_PATH} -o ${BIN_PATH}
             DEPENDS ${SRC_PATH}
@@ -61,7 +63,9 @@ endif()
 
 add_offload_test_device_code(foo.c foo)
 add_offload_test_device_code(bar.c bar)
-add_offload_test_device_code(noargs.c noargs)
+# By default, amdhsa will add a number of "hidden" arguments to the kernel defintion
+# O3 disables this, and results in a kernel function with actually no arguments as seen by liboffload
+add_offload_test_device_code(noargs.c noargs -O3)
 
 add_custom_target(OffloadUnitTestsDeviceBins DEPENDS ${BIN_PATHS})
 

>From 6be750b5bd7f4356b6e7b44c2df13911651ae472 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 2 Jun 2025 10:42:52 +0100
Subject: [PATCH 4/4] On AMD, don't release kernel args if they haven't been
 allocated

---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 88f3b1ac5416c..9b1aede9f9737 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1261,9 +1261,11 @@ struct AMDGPUStreamTy {
     // Consume stream slot and compute dependencies.
     auto [Curr, InputSignal] = consume(OutputSignal);
 
-    // Setup the post action to release the kernel args buffer.
-    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
-      return Err;
+    // Setup the post action to release the kernel args buffer, if it exists
+    if (KernelArgs) {
+      if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
+        return Err;
+    }
 
     // If we are running an RPC server we want to wake up the server thread
     // whenever there is a kernel running and let it sleep otherwise.