[llvm] [Offload] Add olLaunchKernelSuggestedGroupSize (PR #142130)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Fri May 30 05:00:23 PDT 2025
https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/142130
This adds a new entry point, `olLaunchKernelSuggestedGroupSize`, which
launches a kernel without specifying a work group size. Implementations
will use internal device-specific magic to determine an ideal work group
size.
... Eventually, anyway. For this change, the work group size is just hardcoded as
`{1, 1, 1}`.
From 94d2c756d7c11c134f7bb9dbac69f05035e60d86 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 30 May 2025 12:57:54 +0100
Subject: [PATCH] [Offload] Add olLaunchKernelSuggestedGroupSize
This adds a new entry point, `olLaunchKernelSuggestedGroupSize`, which
launches a kernel without specifying a work group size. Implementations
will use internal device-specific magic to determine an ideal work group
size.
... Eventually, anyway. For this change, the work group size is just hardcoded as
`{1, 1, 1}`.
---
offload/liboffload/API/Kernel.td | 35 ++++++++
.../liboffload/include/generated/OffloadAPI.h | 71 +++++++++++++++++
.../include/generated/OffloadEntryPoints.inc | 79 +++++++++++++++++++
.../include/generated/OffloadFuncs.inc | 2 +
.../generated/OffloadImplFuncDecls.inc | 6 ++
.../include/generated/OffloadPrint.hpp | 51 ++++++++++++
offload/liboffload/src/OffloadImpl.cpp | 63 +++++++++++----
7 files changed, 293 insertions(+), 14 deletions(-)
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..2ff75aa2ba002 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -59,3 +59,38 @@ def : Function {
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
];
}
+
+
+def : Struct { // Launch-size arguments taken by olLaunchKernelSuggestedGroupSize
+ let name = "ol_kernel_launch_size_suggested_args_t";
+ let desc = "Size-related arguments for a kernel launch.";
+ let members = [ // Work item counts only; this struct has no group size/count members
+ StructMember<"size_t", "Dimensions", "Number of work dimensions">,
+ StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">,
+ StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">,
+ StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">,
+ StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
+ ];
+}
+
+def : Function { // Kernel launch entry point that takes work item counts rather than group sizes
+ let name = "olLaunchKernelSuggestedGroupSize";
+ let desc = "Enqueue a kernel launch with the specified work items and parameters.";
+ let details = [
+ "Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes"
+ ];
+ let params = [
+ Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, // May be null, but then EventOut must be null too (see returns)
+ Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
+ Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+ Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>, // May be null only when ArgumentsSize is 0 (see returns)
+ Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
+ Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
+ Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+ ];
+ let returns = [ // Argument checks listed explicitly for this entry point
+ Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
+ Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
+ Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
+ ];
+}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index a1d7540519e32..1752340615a82 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
// [out][optional] optional recorded event for the enqueued operation
ol_event_handle_t *EventOut);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Size-related arguments (work item counts, no group sizes) for olLaunchKernelSuggestedGroupSize.
+typedef struct ol_kernel_launch_size_suggested_args_t {
+ size_t Dimensions; /// Number of work dimensions
+ size_t NumItemsX; /// Number of work items on the X dimension
+ size_t NumItemsY; /// Number of work items on the Y dimension
+ size_t NumItemsZ; /// Number of work items on the Z dimension
+ size_t DynSharedMemory; /// Size of dynamic shared memory in bytes.
+} ol_kernel_launch_size_suggested_args_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a kernel launch with the specified work items and parameters.
+///
+/// @details
+/// - Behaves the same as olLaunchKernel, but the caller passes only work
+/// item counts and the implementation determines the work group sizes
+///
+/// @returns
+/// - ::OL_RESULT_SUCCESS
+/// - ::OL_ERRC_UNINITIALIZED
+/// - ::OL_ERRC_DEVICE_LOST
+/// - ::OL_ERRC_INVALID_ARGUMENT
+/// + `Queue == NULL && EventOut != NULL`
+/// - ::OL_ERRC_INVALID_ARGUMENT
+/// + `ArgumentsSize > 0 && ArgumentsData == NULL`
+/// - ::OL_ERRC_INVALID_DEVICE
+/// + If Queue is non-null but does not belong to Device
+/// - ::OL_ERRC_INVALID_NULL_HANDLE
+/// + `NULL == Device`
+/// + `NULL == Kernel`
+/// - ::OL_ERRC_INVALID_NULL_POINTER
+/// + `NULL == LaunchSizeArgs`
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+ // [in][optional] handle of the queue
+ ol_queue_handle_t Queue,
+ // [in] handle of the device to execute on
+ ol_device_handle_t Device,
+ // [in] handle of the kernel
+ ol_kernel_handle_t Kernel,
+ // [in][optional] pointer to the kernel argument struct; may be NULL only when ArgumentsSize is 0
+ const void *ArgumentsData,
+ // [in] size of the kernel argument struct
+ size_t ArgumentsSize,
+ // [in] pointer to the struct containing launch size parameters (work item counts)
+ const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+ // [out][optional] optional recorded event for the enqueued operation; must be NULL when Queue is NULL
+ ol_event_handle_t *EventOut);
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for olGetPlatformInfo
/// @details Each entry is a pointer to the parameter passed to the function;
@@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t {
ol_event_handle_t **pEventOut;
} ol_launch_kernel_params_t;
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olLaunchKernelSuggestedGroupSize
+/// @details Each entry is a pointer to the parameter passed to the function, in the function's parameter order.
+typedef struct ol_launch_kernel_suggested_group_size_params_t {
+ ol_queue_handle_t *pQueue;
+ ol_device_handle_t *pDevice;
+ ol_kernel_handle_t *pKernel;
+ const void **pArgumentsData;
+ size_t *pArgumentsSize;
+ const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs;
+ ol_event_handle_t **pEventOut;
+} ol_launch_kernel_suggested_group_size_params_t;
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Variant of olInit that also sets source code location information
/// @details See also ::olInit
@@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc(
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source
+/// code location information (passed as the trailing CodeLocation parameter)
+/// @details See also ::olLaunchKernelSuggestedGroupSize
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+ ol_queue_handle_t Queue, ol_device_handle_t Device,
+ ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+ const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+ ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
#if defined(__cplusplus)
} // extern "C"
#endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 9feebeea09ec3..3b7c8be609c92 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc(
currentCodeLocation() = nullptr;
return Result;
}
+
+///////////////////////////////////////////////////////////////////////////////
+// Argument validation layer for olLaunchKernelSuggestedGroupSize. When
+// validation is enabled in the offload config, the checks below run in order
+// and the first failing one is reported; otherwise (or when all pass) the
+// call is forwarded unchanged to olLaunchKernelSuggestedGroupSize_impl.
+llvm::Error olLaunchKernelSuggestedGroupSize_val(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  const bool Validate = offloadConfig().ValidationEnabled;
+  // An event can only be recorded on a queue.
+  if (Validate && !Queue && EventOut)
+    return createOffloadError(
+        error::ErrorCode::INVALID_ARGUMENT,
+        "validation failure: Queue == NULL && EventOut != NULL");
+  // A non-empty argument struct needs backing storage.
+  if (Validate && ArgumentsSize > 0 && !ArgumentsData)
+    return createOffloadError(
+        error::ErrorCode::INVALID_ARGUMENT,
+        "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");
+  if (Validate && !Device)
+    return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                              "validation failure: NULL == Device");
+  if (Validate && !Kernel)
+    return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                              "validation failure: NULL == Kernel");
+  if (Validate && !LaunchSizeArgs)
+    return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
+                              "validation failure: NULL == LaunchSizeArgs");
+  return llvm::offload::olLaunchKernelSuggestedGroupSize_impl(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+ ol_queue_handle_t Queue, ol_device_handle_t Device,
+ ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+ const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+ ol_event_handle_t *EventOut) {
+ if (offloadConfig().TracingEnabled) {
+ llvm::errs() << "---> olLaunchKernelSuggestedGroupSize";
+ }
+
+ ol_result_t Result =
+ llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val(
+ Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+ EventOut));
+
+ if (offloadConfig().TracingEnabled) {
+ ol_launch_kernel_suggested_group_size_params_t Params = {
+ &Queue, &Device, &Kernel, &ArgumentsData,
+ &ArgumentsSize, &LaunchSizeArgs, &EventOut};
+ llvm::errs() << "(" << &Params << ")";
+ llvm::errs() << "-> " << Result << "\n";
+ if (Result && Result->Details) {
+ llvm::errs() << " *Error Details* " << Result->Details << " \n";
+ }
+ }
+ return Result;
+}
+// Same as olLaunchKernelSuggestedGroupSize, but publishes CodeLocation through
+// currentCodeLocation() for the duration of the call and resets it afterwards.
+ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Outcome = ::olLaunchKernelSuggestedGroupSize(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+  currentCodeLocation() = nullptr;
+  return Outcome;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 78ff9ddb82799..48a1c73dad631 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram)
OFFLOAD_FUNC(olDestroyProgram)
OFFLOAD_FUNC(olGetKernel)
OFFLOAD_FUNC(olLaunchKernel)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize)
OFFLOAD_FUNC(olInitWithCodeLoc)
OFFLOAD_FUNC(olShutDownWithCodeLoc)
OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc)
@@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
OFFLOAD_FUNC(olDestroyProgramWithCodeLoc)
OFFLOAD_FUNC(olGetKernelWithCodeLoc)
OFFLOAD_FUNC(olLaunchKernelWithCodeLoc)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc)
#undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 71d25dee87867..d8c94e59182bc 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
size_t ArgumentsSize,
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut);
+
+Error olLaunchKernelSuggestedGroupSize_impl( // Implementation hook called by the validating entry point
+ ol_queue_handle_t Queue, ol_device_handle_t Device,
+ ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+ const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+ ol_event_handle_t *EventOut);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 3aad6223d4dea..706f45987e662 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os,
os << "}";
return os;
}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type
+/// @returns llvm::raw_ostream &
+
+inline llvm::raw_ostream &
+operator<<(llvm::raw_ostream &os,
+           const struct ol_kernel_launch_size_suggested_args_t params) {
+  // Emits the struct as a C-style designated initializer, one field at a time.
+  os << "(struct ol_kernel_launch_size_suggested_args_t){";
+  os << ".Dimensions = " << params.Dimensions << ", ";
+  os << ".NumItemsX = " << params.NumItemsX << ", ";
+  os << ".NumItemsY = " << params.NumItemsY << ", ";
+  os << ".NumItemsZ = " << params.NumItemsZ << ", ";
+  os << ".DynSharedMemory = " << params.DynSharedMemory;
+  return os << "}";
+}
inline llvm::raw_ostream &
operator<<(llvm::raw_ostream &os,
@@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os,
return os;
}
+// Prints the traced arguments of olLaunchKernelSuggestedGroupSize as a
+// comma-separated ".Name = value" list; pointer-like values go through
+// printPtr, the size is streamed directly.
+inline llvm::raw_ostream &operator<<(
+    llvm::raw_ostream &os,
+    const struct ol_launch_kernel_suggested_group_size_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", " << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", " << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", " << ".ArgumentsData = ";
+  printPtr(os, *params->pArgumentsData);
+  os << ", " << ".ArgumentsSize = " << *params->pArgumentsSize;
+  os << ", " << ".LaunchSizeArgs = ";
+  printPtr(os, *params->pLaunchSizeArgs);
+  os << ", " << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..8a57afa8522c5 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
return Error::success();
}
-Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
- ol_kernel_handle_t Kernel, const void *ArgumentsData,
- size_t ArgumentsSize,
- const ol_kernel_launch_size_args_t *LaunchSizeArgs,
- ol_event_handle_t *EventOut) {
+namespace {
+Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device,
+ ol_kernel_handle_t Kernel, KernelArgsTy &Args,
+ ol_event_handle_t *EventOut) {
auto *DeviceImpl = Device->Device;
if (Queue && Device != Queue->Device) {
return createOffloadError(
@@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
+ auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+ auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args,
+ AsyncInfoWrapper);
+
+ AsyncInfoWrapper.finalize(Err);
+ if (Err)
+ return Err;
+
+ if (EventOut)
+ *EventOut = makeEvent(Queue);
+
+ return Error::success();
+}
+} // namespace
+
+Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
+ ol_kernel_handle_t Kernel, const void *ArgumentsData,
+ size_t ArgumentsSize,
+ const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+ ol_event_handle_t *EventOut) {
KernelArgsTy LaunchArgs{};
LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
@@ -514,18 +533,34 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
// Don't do anything with pointer indirection; use arg data as-is
LaunchArgs.Flags.IsCUDA = true;
- auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
- auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
- LaunchArgs, AsyncInfoWrapper);
+ return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
+}
- AsyncInfoWrapper.finalize(Err);
- if (Err)
- return Err;
+/// Launch \p Kernel over the work items described by \p LaunchSizeArgs, with
+/// the work group size chosen by the implementation rather than the caller.
+///
+/// The number of groups per dimension is the item count divided by the chosen
+/// group size, rounded up. \p ArgumentsData / \p ArgumentsSize are passed to
+/// the kernel as-is (no pointer indirection). The launch itself, the
+/// queue/device check and the optional \p EventOut are handled by do_launch.
+///
+/// \returns whatever do_launch returns.
+Error olLaunchKernelSuggestedGroupSize_impl(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  // TODO: Use backend specific magic to determine the best work group size
+  constexpr size_t PreferredSize[3] = {1, 1, 1};
+  // Ceiling division: plain `/` would silently drop the trailing partial group
+  // once PreferredSize is no longer {1, 1, 1}. Written as quotient plus
+  // remainder test so it cannot overflow for large item counts. Note that the
+  // last group may then cover ids past the requested item count.
+  auto NumGroupsFor = [](size_t NumItems, size_t GroupSize) -> size_t {
+    return NumItems / GroupSize + (NumItems % GroupSize != 0 ? 1 : 0);
+  };
-  if (EventOut)
-    *EventOut = makeEvent(Queue);
+  KernelArgsTy LaunchArgs{};
+  LaunchArgs.NumTeams[0] =
+      NumGroupsFor(LaunchSizeArgs->NumItemsX, PreferredSize[0]);
+  LaunchArgs.NumTeams[1] =
+      NumGroupsFor(LaunchSizeArgs->NumItemsY, PreferredSize[1]);
+  LaunchArgs.NumTeams[2] =
+      NumGroupsFor(LaunchSizeArgs->NumItemsZ, PreferredSize[2]);
+  LaunchArgs.ThreadLimit[0] = PreferredSize[0];
+  LaunchArgs.ThreadLimit[1] = PreferredSize[1];
+  LaunchArgs.ThreadLimit[2] = PreferredSize[2];
+  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
-  return Error::success();
+  // Params must stay alive until do_launch returns: LaunchArgs.ArgPtrs points
+  // at this local.
+  KernelLaunchParamsTy Params;
+  Params.Data = const_cast<void *>(ArgumentsData);
+  Params.Size = ArgumentsSize;
+  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
+  // Don't do anything with pointer indirection; use arg data as-is
+  LaunchArgs.Flags.IsCUDA = true;
+
+  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
}
} // namespace offload
More information about the llvm-commits
mailing list