[llvm] [Offload] Add olLaunchKernelSuggestedGroupSize (PR #142130)

Fri May 30 05:00:23 PDT 2025

https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/142130

This adds a new entrypoint `olLaunchKernelSuggestedGroupSize` which
launches a kernel without specifying a work group size. Implementations
will use internal device specific magic to determine an ideal work group
size.

... Eventually, anyway. For this change the work group size is simply
hardcoded to `{1, 1, 1}`.


>From 94d2c756d7c11c134f7bb9dbac69f05035e60d86 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 30 May 2025 12:57:54 +0100
Subject: [PATCH] [Offload] Add olLaunchKernelSuggestedGroupSize

This adds a new entrypoint `olLaunchKernelSuggestedGroupSize` which
launches a kernel without specifying a work group size. Implementations
will use internal device specific magic to determine an ideal work group
size.

... Eventually, anyway. For this change the work group size is simply
hardcoded to `{1, 1, 1}`.
---
 offload/liboffload/API/Kernel.td              | 35 ++++++++
 .../liboffload/include/generated/OffloadAPI.h | 71 +++++++++++++++++
 .../include/generated/OffloadEntryPoints.inc  | 79 +++++++++++++++++++
 .../include/generated/OffloadFuncs.inc        |  2 +
 .../generated/OffloadImplFuncDecls.inc        |  6 ++
 .../include/generated/OffloadPrint.hpp        | 51 ++++++++++++
 offload/liboffload/src/OffloadImpl.cpp        | 63 +++++++++++----
 7 files changed, 293 insertions(+), 14 deletions(-)

diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..2ff75aa2ba002 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -59,3 +59,38 @@ def : Function {
         Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
     ];
 }
+
+
+def : Struct {
+    let name = "ol_kernel_launch_size_suggested_args_t";
+    let desc = "Size-related arguments for a kernel launch.";
+    let members = [
+        StructMember<"size_t", "Dimensions", "Number of work dimensions">,
+        StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">,
+        StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">,
+        StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">,
+        StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
+    ];
+}
+
+def : Function {
+    let name = "olLaunchKernelSuggestedGroupSize";
+    let desc = "Enqueue a kernel launch with the specified work items and parameters.";
+    let details = [
+        "Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes"
+    ];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+        Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
+        Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
+        Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [
+        Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
+        Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
+        Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
+    ];
+}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index a1d7540519e32..1752340615a82 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
     // [out][optional] optional recorded event for the enqueued operation
     ol_event_handle_t *EventOut);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Size-related arguments for a kernel launch.
+typedef struct ol_kernel_launch_size_suggested_args_t {
+  size_t Dimensions;      /// Number of work dimensions
+  size_t NumItemsX;       /// Number of work items on the X dimension
+  size_t NumItemsY;       /// Number of work items on the Y dimension
+  size_t NumItemsZ;       /// Number of work items on the Z dimension
+  size_t DynSharedMemory; /// Size of dynamic shared memory in bytes.
+} ol_kernel_launch_size_suggested_args_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a kernel launch with the specified work items and parameters.
+///
+/// @details
+///    - Behaves the same as olLaunchKernel, but the implementation
+///    automatically determines optimal work group sizes
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_ARGUMENT
+///         + `Queue == NULL && EventOut != NULL`
+///     - ::OL_ERRC_INVALID_ARGUMENT
+///         + `ArgumentsSize > 0 && ArgumentsData == NULL`
+///     - ::OL_ERRC_INVALID_DEVICE
+///         + If Queue is non-null but does not belong to Device
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == LaunchSizeArgs`
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+    // [in][optional] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] handle of the device to execute on
+    ol_device_handle_t Device,
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel,
+    // [in][optional] pointer to the kernel argument struct
+    const void *ArgumentsData,
+    // [in] size of the kernel argument struct
+    size_t ArgumentsSize,
+    // [in] pointer to the struct containing launch size parameters
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatformInfo
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t {
   ol_event_handle_t **pEventOut;
 } ol_launch_kernel_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olLaunchKernelSuggestedGroupSize
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_launch_kernel_suggested_group_size_params_t {
+  ol_queue_handle_t *pQueue;
+  ol_device_handle_t *pDevice;
+  ol_kernel_handle_t *pKernel;
+  const void **pArgumentsData;
+  size_t *pArgumentsSize;
+  const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs;
+  ol_event_handle_t **pEventOut;
+} ol_launch_kernel_suggested_group_size_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc(
     const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source
+/// code location information
+/// @details See also ::olLaunchKernelSuggestedGroupSize
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 9feebeea09ec3..3b7c8be609c92 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc(
   currentCodeLocation() = nullptr;
   return Result;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+llvm::Error olLaunchKernelSuggestedGroupSize_val(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  if (offloadConfig().ValidationEnabled) {
+    if (Queue == NULL && EventOut != NULL) {
+      return createOffloadError(
+          error::ErrorCode::INVALID_ARGUMENT,
+          "validation failure: Queue == NULL && EventOut != NULL");
+    }
+
+    if (ArgumentsSize > 0 && ArgumentsData == NULL) {
+      return createOffloadError(
+          error::ErrorCode::INVALID_ARGUMENT,
+          "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");
+    }
+
+    if (NULL == Device) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                                "validation failure: NULL == Device");
+    }
+
+    if (NULL == Kernel) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                                "validation failure: NULL == Kernel");
+    }
+
+    if (NULL == LaunchSizeArgs) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
+                                "validation failure: NULL == LaunchSizeArgs");
+    }
+  }
+
+  return llvm::offload::olLaunchKernelSuggestedGroupSize_impl(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    llvm::errs() << "---> olLaunchKernelSuggestedGroupSize";
+  }
+
+  ol_result_t Result =
+      llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val(
+          Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+          EventOut));
+
+  if (offloadConfig().TracingEnabled) {
+    ol_launch_kernel_suggested_group_size_params_t Params = {
+        &Queue,         &Device,         &Kernel,  &ArgumentsData,
+        &ArgumentsSize, &LaunchSizeArgs, &EventOut};
+    llvm::errs() << "(" << &Params << ")";
+    llvm::errs() << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      llvm::errs() << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = ::olLaunchKernelSuggestedGroupSize(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 78ff9ddb82799..48a1c73dad631 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram)
 OFFLOAD_FUNC(olDestroyProgram)
 OFFLOAD_FUNC(olGetKernel)
 OFFLOAD_FUNC(olLaunchKernel)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc)
@@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
 OFFLOAD_FUNC(olDestroyProgramWithCodeLoc)
 OFFLOAD_FUNC(olGetKernelWithCodeLoc)
 OFFLOAD_FUNC(olLaunchKernelWithCodeLoc)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 71d25dee87867..d8c94e59182bc 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           size_t ArgumentsSize,
                           const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                           ol_event_handle_t *EventOut);
+
+Error olLaunchKernelSuggestedGroupSize_impl(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 3aad6223d4dea..706f45987e662 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os,
   os << "}";
   return os;
 }
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type
+/// @returns llvm::raw_ostream &
+
+inline llvm::raw_ostream &
+operator<<(llvm::raw_ostream &os,
+           const struct ol_kernel_launch_size_suggested_args_t params) {
+  os << "(struct ol_kernel_launch_size_suggested_args_t){";
+  os << ".Dimensions = ";
+  os << params.Dimensions;
+  os << ", ";
+  os << ".NumItemsX = ";
+  os << params.NumItemsX;
+  os << ", ";
+  os << ".NumItemsY = ";
+  os << params.NumItemsY;
+  os << ", ";
+  os << ".NumItemsZ = ";
+  os << params.NumItemsZ;
+  os << ", ";
+  os << ".DynSharedMemory = ";
+  os << params.DynSharedMemory;
+  os << "}";
+  return os;
+}
 
 inline llvm::raw_ostream &
 operator<<(llvm::raw_ostream &os,
@@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os,
   return os;
 }
 
+inline llvm::raw_ostream &operator<<(
+    llvm::raw_ostream &os,
+    const struct ol_launch_kernel_suggested_group_size_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", ";
+  os << ".ArgumentsData = ";
+  printPtr(os, *params->pArgumentsData);
+  os << ", ";
+  os << ".ArgumentsSize = ";
+  os << *params->pArgumentsSize;
+  os << ", ";
+  os << ".LaunchSizeArgs = ";
+  printPtr(os, *params->pLaunchSizeArgs);
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..8a57afa8522c5 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
   return Error::success();
 }
 
-Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
-                          ol_kernel_handle_t Kernel, const void *ArgumentsData,
-                          size_t ArgumentsSize,
-                          const ol_kernel_launch_size_args_t *LaunchSizeArgs,
-                          ol_event_handle_t *EventOut) {
+namespace {
+Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                ol_kernel_handle_t Kernel, KernelArgsTy &Args,
+                ol_event_handle_t *EventOut) {
   auto *DeviceImpl = Device->Device;
   if (Queue && Device != Queue->Device) {
     return createOffloadError(
@@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
 
   auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
   AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
+  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+  auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args,
+                                AsyncInfoWrapper);
+
+  AsyncInfoWrapper.finalize(Err);
+  if (Err)
+    return Err;
+
+  if (EventOut)
+    *EventOut = makeEvent(Queue);
+
+  return Error::success();
+}
+} // namespace
+
+Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                          ol_kernel_handle_t Kernel, const void *ArgumentsData,
+                          size_t ArgumentsSize,
+                          const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                          ol_event_handle_t *EventOut) {
   KernelArgsTy LaunchArgs{};
   LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
   LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
@@ -514,18 +533,53 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   // Don't do anything with pointer indirection; use arg data as-is
   LaunchArgs.Flags.IsCUDA = true;
 
-  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
-  auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
-                                LaunchArgs, AsyncInfoWrapper);
+  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
+}
 
-  AsyncInfoWrapper.finalize(Err);
-  if (Err)
-    return Err;
+/// Launch \p Kernel over the work items described by \p LaunchSizeArgs,
+/// letting the implementation choose the work group size.
+///
+/// The per-dimension group count is the work item count divided by the chosen
+/// group size, rounded up so that a count which is not a multiple of the group
+/// size is still fully covered. \p ArgumentsData is forwarded as-is.
+///
+/// NOTE(review): LaunchSizeArgs->Dimensions is not consulted here; confirm
+/// whether callers are expected to pass 1 for unused dimensions.
+Error olLaunchKernelSuggestedGroupSize_impl(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  // TODO: Use backend specific magic to determine the best work group size
+  size_t PreferredSize[3] = {1, 1, 1};
 
-  if (EventOut)
-    *EventOut = makeEvent(Queue);
+  // Round up rather than truncate: plain division would silently drop the
+  // trailing work items once the group size no longer divides the count.
+  auto NumGroups = [](size_t NumItems, size_t GroupSize) {
+    return NumItems / GroupSize + (NumItems % GroupSize != 0 ? 1 : 0);
+  };
+
+  KernelArgsTy LaunchArgs{};
+  LaunchArgs.NumTeams[0] =
+      NumGroups(LaunchSizeArgs->NumItemsX, PreferredSize[0]);
+  LaunchArgs.NumTeams[1] =
+      NumGroups(LaunchSizeArgs->NumItemsY, PreferredSize[1]);
+  LaunchArgs.NumTeams[2] =
+      NumGroups(LaunchSizeArgs->NumItemsZ, PreferredSize[2]);
+  LaunchArgs.ThreadLimit[0] = PreferredSize[0];
+  LaunchArgs.ThreadLimit[1] = PreferredSize[1];
+  LaunchArgs.ThreadLimit[2] = PreferredSize[2];
+  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
 
-  return Error::success();
+  // Wrap the raw argument buffer; ArgPtrs is made to point at this wrapper.
+  KernelLaunchParamsTy Params;
+  Params.Data = const_cast<void *>(ArgumentsData);
+  Params.Size = ArgumentsSize;
+  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
+  // Don't do anything with pointer indirection; use arg data as-is
+  LaunchArgs.Flags.IsCUDA = true;
+
+  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
 }
 
 } // namespace offload