[llvm] [offload] Change olLaunchKernel to accept argument arrays (PR #173263)

Mon Dec 22 06:24:39 PST 2025

https://github.com/lplewa created https://github.com/llvm/llvm-project/pull/173263

olLaunchKernel previously accepted kernel arguments as a single contiguous buffer. While this was sufficient for the CUDA and AMD plugins, Level Zero requires a separate pointer for each argument, and these pointers cannot be derived from a flat buffer without knowing the size of each individual argument.

This change updates the interface to accept an array of argument pointers, a corresponding array of argument sizes, and the number of elements in those arrays (const void ** + const int64_t * + uint32_t).


Note: the offload tests have to be updated to the new API.

>From af6a6f969f043eb58a1f5d9ed8d5c7a0ff7defe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Plewa?= <lukasz.plewa at intel.com>
Date: Thu, 18 Dec 2025 14:28:10 +0100
Subject: [PATCH] [offload] Change olLaunchKernel to accept argument arrays

olLaunchKernel previously accepted kernel arguments as a single
contiguous buffer. While this was sufficient for CUDA and AMD plugins,
Level Zero requires separate argument ptrs that cannot be derived from a
flat buffer, without knowing the separate size of each argument.

This change updates the interface to accept an array of argument
pointers and a corresponding array of argument sizes
(void ** + int64_t *).
---
 offload/liboffload/API/Kernel.td              | 35 +++++++++++++------
 offload/liboffload/src/OffloadImpl.cpp        | 14 ++++----
 .../common/src/PluginInterface.cpp            |  7 ++--
 .../level_zero/src/L0Kernel.cpp               |  9 ++---
 .../include/mathtest/DeviceContext.hpp        |  5 +--
 .../Conformance/lib/DeviceContext.cpp         | 10 +++---
 6 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 2f5692a19d712..270a361541489 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -26,18 +26,31 @@ def olLaunchKernel : Function {
         "If a queue is not specified, kernel execution happens synchronously",
         "ArgumentsData may be set to NULL (to indicate no parameters)"
     ];
-    let params = [
-        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
-        Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
-        Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
-        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
-        Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
-        Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
+    let params =
+        [Param<"ol_queue_handle_t", "Queue", "handle of the queue",
+               PARAM_IN_OPTIONAL>,
+         Param<"ol_device_handle_t", "Device",
+               "handle of the device to execute on", PARAM_IN>,
+         Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel",
+               PARAM_IN>,
+         Param<"const void**", "ArgumentsData",
+               "pointer to the kernel arguments array", PARAM_IN_OPTIONAL>,
+         Param<"const int64_t*", "ArgumentsSize",
+               "pointer to the kernel arguments sizes array",
+               PARAM_IN_OPTIONAL>,
+         Param<"uint32_t", "ArgumentsNum",
+               "Number of the elements in the arguments arrays", PARAM_IN>,
+         Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs",
+               "pointer to the struct containing launch size parameters",
+               PARAM_IN>,
     ];
-    let returns = [
-        Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
-        Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
-        Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+    let returns =
+        [Return<"OL_ERRC_INVALID_ARGUMENT",
+                ["`ArgumentsNum > 0 && (ArgumentsData == NULL || ArgumentsSize "
+                 "== NULL)`"]>,
+         Return<"OL_ERRC_INVALID_DEVICE",
+                ["If Queue is non-null but does not belong to Device"]>,
+         Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
     ];
 }
 
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 2eb7017bbc1a8..19d6a204069d4 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -1073,8 +1073,8 @@ Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
 }
 
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
-                          ol_symbol_handle_t Kernel, const void *ArgumentsData,
-                          size_t ArgumentsSize,
+                          ol_symbol_handle_t Kernel, const void **ArgumentsData,
+                          const int64_t *ArgumentsSizes, uint32_t ArgumentsNum,
                           const ol_kernel_launch_size_args_t *LaunchSizeArgs) {
   auto *DeviceImpl = Device->Device;
   if (Queue && Device != Queue->Device) {
@@ -1098,12 +1098,10 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
   LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
 
-  KernelLaunchParamsTy Params;
-  Params.Data = const_cast<void *>(ArgumentsData);
-  Params.Size = ArgumentsSize;
-  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
-  // Don't do anything with pointer indirection; use arg data as-is
-  LaunchArgs.Flags.IsCUDA = true;
+  LaunchArgs.ArgPtrs = const_cast<void **>(ArgumentsData);
+  LaunchArgs.NumArgs = ArgumentsNum;
+  LaunchArgs.ArgSizes = const_cast<int64_t *>(ArgumentsSizes);
+  std::vector<void *> ArgPtrs;
 
   auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
   auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index fc5fe7529e3e5..fa074cb4d5bd7 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -581,8 +581,11 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
   }
 
   for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
-    Args[I] =
-        (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
+    if (ArgOffsets == nullptr)
+      Args[I] = ArgPtrs[I - KLEOffset];
+    else
+      Args[I] = (void *)((intptr_t)ArgPtrs[I - KLEOffset] +
+                         ArgOffsets[I - KLEOffset]);
     Ptrs[I] = &Args[I];
   }
   return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index e6d7bba305fd8..17657c6d35d00 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -449,13 +449,14 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   for (int32_t I = 0; I < NumArgs; I++) {
     // Scope code to ease integration with downstream custom code.
     {
-      void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
-      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
-                        Arg == nullptr ? nullptr : &Arg);
+      auto arg = KernelArgs.ArgPtrs[I];
+      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I,
+                        KernelArgs.ArgSizes[I], arg);
+
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
            "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
            ") was set successfully for device %s.\n",
-           I, DPxPTR(Arg), IdStr);
+           I, DPxPTR(arg), IdStr);
     }
   }
 
diff --git a/offload/unittests/Conformance/include/mathtest/DeviceContext.hpp b/offload/unittests/Conformance/include/mathtest/DeviceContext.hpp
index 5c31fc3da53cd..2d34bce33f986 100644
--- a/offload/unittests/Conformance/include/mathtest/DeviceContext.hpp
+++ b/offload/unittests/Conformance/include/mathtest/DeviceContext.hpp
@@ -126,8 +126,9 @@ class DeviceContext {
                   llvm::StringRef KernelName) const noexcept;
 
   void launchKernelImpl(ol_symbol_handle_t KernelHandle, uint32_t NumGroups,
-                        uint32_t GroupSize, const void *KernelArgs,
-                        std::size_t KernelArgsSize) const noexcept;
+                        uint32_t GroupSize, const void **KernelArgs,
+                        const int64_t *KernelArgsSizes,
+                        std::size_t KernelArgsNum) const noexcept;
 
   std::size_t GlobalDeviceId;
   ol_device_handle_t DeviceHandle;
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index 6c3425f1e17c2..4a6a491f88084 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -286,9 +286,11 @@ DeviceContext::getKernelHandle(ol_program_handle_t ProgramHandle,
   return Handle;
 }
 
-void DeviceContext::launchKernelImpl(
-    ol_symbol_handle_t KernelHandle, uint32_t NumGroups, uint32_t GroupSize,
-    const void *KernelArgs, std::size_t KernelArgsSize) const noexcept {
+void DeviceContext::launchKernelImpl(ol_symbol_handle_t KernelHandle,
+                                     uint32_t NumGroups, uint32_t GroupSize,
+                                     const void **KernelArgs,
+                                     const int64_t *KernelArgsSizes,
+                                     std::size_t KernelArgsNum) const noexcept {
   ol_kernel_launch_size_args_t LaunchSizeArgs;
   LaunchSizeArgs.Dimensions = 1;
   LaunchSizeArgs.NumGroups = {NumGroups, 1, 1};
@@ -296,7 +298,7 @@ void DeviceContext::launchKernelImpl(
   LaunchSizeArgs.DynSharedMemory = 0;
 
   OL_CHECK(olLaunchKernel(nullptr, DeviceHandle, KernelHandle, KernelArgs,
-                          KernelArgsSize, &LaunchSizeArgs));
+                          KernelArgsSizes, KernelArgsNum, &LaunchSizeArgs));
 }
 
 [[nodiscard]] llvm::StringRef DeviceContext::getName() const noexcept {