[llvm] [offload] Add properties parameter to olLaunchKernel (PR #184343)
Łukasz Plewa via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 03:42:34 PST 2026
https://github.com/lplewa updated https://github.com/llvm/llvm-project/pull/184343
>From 41d3cb2ea25d60f9267d264ca00ce49329ea1892 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Plewa?= <lukasz.plewa at intel.com>
Date: Thu, 18 Dec 2025 14:28:10 +0100
Subject: [PATCH] [offload] Add properties parameter to olLaunchKernel
Introduce a properties argument to olLaunchKernel to enable future
extensions.
This change adds initial support for:
- cooperative kernel launch
- extended kernel argument size (required by the L0 plugin)
---
offload/include/Shared/APITypes.h | 9 +-
offload/liboffload/API/Device.td | 9 +-
offload/liboffload/API/Kernel.td | 73 +++++++++--
offload/liboffload/src/OffloadImpl.cpp | 61 ++++++++-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 5 +
.../common/include/PluginInterface.h | 9 ++
.../cuda/dynamic_cuda/cuda.cpp | 3 +-
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 48 ++++++-
offload/plugins-nextgen/cuda/src/rtl.cpp | 80 ++++++++++--
offload/plugins-nextgen/host/src/rtl.cpp | 6 +
.../level_zero/include/L0Context.h | 5 +
.../level_zero/include/L0Device.h | 8 ++
.../level_zero/include/L0Kernel.h | 13 +-
.../level_zero/src/L0Context.cpp | 7 +
.../level_zero/src/L0Device.cpp | 27 ++++
.../level_zero/src/L0Kernel.cpp | 121 +++++++++++++++---
offload/tools/offload-tblgen/PrintGen.cpp | 7 +-
.../Conformance/lib/DeviceContext.cpp | 2 +-
offload/unittests/OffloadAPI/CMakeLists.txt | 3 +-
.../unittests/OffloadAPI/common/Fixtures.hpp | 37 ++++++
.../OffloadAPI/kernel/olLaunchKernel.cpp | 79 +++---------
.../kernel/olLaunchKernelCooperative.cpp | 120 +++++++++++++++++
.../unittests/OffloadAPI/memory/olMemcpy.cpp | 6 +-
.../OffloadAPI/queue/olLaunchHostFunction.cpp | 4 +-
.../OffloadAPI/queue/olWaitEvents.cpp | 6 +-
25 files changed, 630 insertions(+), 118 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..9213f2924f1f7 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -100,10 +100,11 @@ struct KernelArgsTy {
uint64_t Tripcount =
0; // Tripcount for the teams / distribute loop, 0 otherwise.
struct {
- uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
- uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
+ uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+ uint64_t Cooperative : 1; // Was this kernel spawned as cooperative.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index 6ada191089674..7790386ae02e1 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -47,7 +47,14 @@ def ol_device_info_t : Enum {
];
list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
- let etors = !listconcat(basic_etors, fp_configs, native_vec_widths);
+ // This list is maintained separately to allow adding new basic etors without
+ // changing the values of previous ones.
+ list<TaggedEtor> basic_etors2 =
+ [TaggedEtor<"COOPERATIVE_LAUNCH_SUPPORT", "bool",
+ "Is cooperative kernel launch supported">,
+ ];
+ let etors =
+ !listconcat(basic_etors, fp_configs, native_vec_widths, basic_etors2);
}
def ol_device_fp_capability_flag_t : Enum {
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 2f5692a19d712..1e713bed1e903 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -20,20 +20,52 @@ def ol_kernel_launch_size_args_t : Struct {
];
}
+def OL_KERNEL_LAUNCH_PROP_END : Macro {
+ let desc = "last element of the ol_kernel_launch_prop_t array";
+ let value = "{OL_KERNEL_LAUNCH_PROP_TYPE_NONE, NULL}";
+}
+
+def ol_kernel_launch_prop_type_t : Enum {
+ let desc = "Defines structure type";
+ let is_typed = 1;
+ let etors =
+ [TaggedEtor<"none", "void *", "Used for null terminating property array">,
+ TaggedEtor<"size", "size_t *", "Array of the arguments sizes.">,
+ TaggedEtor<"is_cooperative ", "bool *", "Cooperative kernel launch">];
+}
+
+def ol_kernel_launch_prop_t : Struct {
+ let desc = "Optional properties for kernel launch.";
+ let members = [StructMember<"ol_kernel_launch_prop_type_t", "type",
+ "Type of the data field">,
+ StructMember<"void *", "data",
+ "Pointer to property-specific data.">];
+}
+
def olLaunchKernel : Function {
let desc = "Enqueue a kernel launch with the specified size and parameters.";
let details = [
"If a queue is not specified, kernel execution happens synchronously",
"ArgumentsData may be set to NULL (to indicate no parameters)"
];
- let params = [
- Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
- Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
- Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
- Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
- Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
- Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
- ];
+ let params =
+ [Param<"ol_queue_handle_t", "Queue", "handle of the queue",
+ PARAM_IN_OPTIONAL>,
+ Param<"ol_device_handle_t", "Device",
+ "handle of the device to execute on", PARAM_IN>,
+ Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel",
+ PARAM_IN>,
+ Param<"const void*", "ArgumentsData",
+ "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
+ Param<"size_t", "ArgumentsSize", "size of the kernel argument struct",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs",
+ "pointer to the struct containing launch size parameters",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_prop_t *", "Properties",
+ "Array of optional properties, last element must be "
+ "OL_KERNEL_LAUNCH_PROP_END",
+ PARAM_IN_OPTIONAL>];
let returns = [
Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
@@ -57,3 +89,28 @@ def olCalculateOptimalOccupancy : Function {
Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
];
}
+
+def olGetKernelMaxCooperativeGroupCount : Function {
+ let desc = "Query the maximum number of work groups that can be launched "
+ "cooperatively for a kernel.";
+ let details =
+ ["This function returns the maximum number of work groups that can "
+ "participate in a cooperative launch for the given kernel.",
+ "The maximum count depends on the work group size and dynamic shared "
+ "memory usage.",
+ ];
+ let params = [Param<"ol_device_handle_t", "Device",
+ "device intended to run the kernel", PARAM_IN>,
+ Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs",
+ "launch size parameters", PARAM_IN>,
+ Param<"uint32_t*", "MaxGroupCount",
+ "maximum number of cooperative groups", PARAM_OUT>];
+ let returns =
+ [Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+ Return<
+ "OL_ERRC_UNSUPPORTED", ["Cooperative launch is not supported or "
+ "backend cannot provide this information"]>,
+ ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index dd3ec0f61b4da..7b5bbb6088c4b 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -496,7 +496,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
"plugin returned incorrect type");
return Info.writeString(std::get<std::string>(Entry->Value).c_str());
}
-
+ case OL_DEVICE_INFO_COOPERATIVE_LAUNCH_SUPPORT: {
+ // Bool value
+ if (!std::holds_alternative<bool>(Entry->Value))
+ return makeError(ErrorCode::BACKEND_FAILURE,
+ "plugin returned incorrect type");
+ return Info.write(static_cast<uint8_t>(std::get<bool>(Entry->Value)));
+ }
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
case OL_DEVICE_INFO_MAX_WORK_SIZE:
case OL_DEVICE_INFO_VENDOR_ID:
@@ -1032,10 +1038,39 @@ Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
return Error::success();
}
+Error olGetKernelMaxCooperativeGroupCount_impl(
+ ol_device_handle_t Device, ol_symbol_handle_t Kernel,
+ const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+ uint32_t *MaxGroupCount) {
+ if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+ return createOffloadError(ErrorCode::SYMBOL_KIND,
+ "provided symbol is not a kernel");
+
+ GenericDeviceTy *DeviceImpl = Device->Device;
+ auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
+
+ // Extract work group size from LaunchSizeArgs
+ size_t LocalWorkSize[3];
+ LocalWorkSize[0] = LaunchSizeArgs->GroupSize.x;
+ LocalWorkSize[1] = LaunchSizeArgs->GroupSize.y;
+ LocalWorkSize[2] = LaunchSizeArgs->GroupSize.z;
+
+ auto Res = KernelImpl->getMaxCooperativeGroupCount(
+ *DeviceImpl, LaunchSizeArgs->Dimensions, LocalWorkSize,
+ LaunchSizeArgs->DynSharedMemory);
+ if (auto Err = Res.takeError())
+ return Err;
+
+ *MaxGroupCount = *Res;
+
+ return Error::success();
+}
+
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_symbol_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
- const ol_kernel_launch_size_args_t *LaunchSizeArgs) {
+ const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+ const ol_kernel_launch_prop_t *Properties) {
auto *DeviceImpl = Device->Device;
if (Queue && Device != Queue->Device) {
return createOffloadError(
@@ -1048,7 +1083,6 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
"provided symbol is not a kernel");
auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
- AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
KernelArgsTy LaunchArgs{};
LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroups.x;
LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroups.y;
@@ -1058,6 +1092,27 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
+ while (Properties && Properties->type != OL_KERNEL_LAUNCH_PROP_TYPE_NONE) {
+ switch (Properties->type) {
+ case OL_KERNEL_LAUNCH_PROP_TYPE_SIZE:
+ // This API requests size_t, while KernelArgsTy uses int64_t.
+ // TODO: We might want to change this struct to also use const size_t
+ LaunchArgs.ArgSizes = const_cast<int64_t *>(
+ reinterpret_cast<const int64_t *>(Properties->data));
+ break;
+ case OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE:
+ LaunchArgs.Flags.Cooperative =
+ *reinterpret_cast<const bool *>(Properties->data);
+ break;
+ default:
+ return createOffloadError(ErrorCode::INVALID_ENUMERATION,
+ "olLaunchKernel property enum '%i' is invalid",
+ Properties->type);
+ }
+ Properties++;
+ }
+
+ AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
KernelLaunchParamsTy Params;
Params.Data = const_cast<void *>(ArgumentsData);
Params.Size = ArgumentsSize;
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 379c8ec11225d..fb413f2363821 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3737,6 +3737,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ // Cooperative kernel launch is not yet supported for AMDGPU
+ if (KernelArgs.Flags.Cooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "cooperative kernel launch not supported for AMDGPU");
+
AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 1c59ed1eda841..fcd08aeff192e 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -370,6 +370,15 @@ struct GenericKernelTy {
virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
uint64_t DynamicMemSize) const = 0;
+ /// Get the maximum number of work groups that can be launched cooperatively.
+ virtual Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const {
+ return Plugin::error(error::ErrorCode::UNSUPPORTED,
+ "cooperative launch not supported");
+ }
+
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 80e3e418ae3fa..d0916789d0f57 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -42,7 +42,7 @@ DLWRAP(cuDeviceTotalMem, 2)
DLWRAP(cuDriverGetVersion, 1)
DLWRAP(cuGetErrorString, 2)
-DLWRAP(cuLaunchKernel, 11)
+DLWRAP(cuLaunchKernelEx, 4)
DLWRAP(cuLaunchHostFunc, 3)
DLWRAP(cuMemAlloc, 2)
@@ -83,6 +83,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor, 4)
DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 7e42c66dddabb..eb149be05777d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -295,6 +295,48 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
typedef size_t (*CUoccupancyB2DSize)(int);
+typedef enum CUlaunchAttributeID_enum {
+ CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+ CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1,
+ CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2,
+ CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+ CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4,
+ CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+ CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6,
+ CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7,
+ CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
+ CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
+ CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
+ CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
+ CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
+ CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
+ CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14
+} CUlaunchAttributeID;
+
+typedef union CUlaunchAttributeValue_union {
+ char pad[64];
+ int cooperative;
+} CUlaunchAttributeValue;
+
+typedef struct CUlaunchAttribute_st {
+ CUlaunchAttributeID id;
+ char pad[8 - sizeof(CUlaunchAttributeID)];
+ CUlaunchAttributeValue value;
+} CUlaunchAttribute;
+
+typedef struct CUlaunchConfig_st {
+ unsigned int gridDimX;
+ unsigned int gridDimY;
+ unsigned int gridDimZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ unsigned int sharedMemBytes;
+ CUstream hStream;
+ CUlaunchAttribute *attrs;
+ unsigned int numAttrs;
+} CUlaunchConfig;
+
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -310,9 +352,7 @@ CUresult cuDriverGetVersion(int *);
CUresult cuGetErrorString(CUresult, const char **);
CUresult cuInit(unsigned);
-CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
- unsigned, unsigned, unsigned, CUstream, void **,
- void **);
+CUresult cuLaunchKernelEx(const CUlaunchConfig *, CUfunction, void **, void **);
CUresult cuLaunchHostFunc(CUstream, CUhostFn, void *);
CUresult cuMemAlloc(CUdeviceptr *, size_t);
@@ -390,6 +430,8 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int *, CUfunction, int,
+ size_t);
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index d5ab0b3309c86..d0b56ee364f56 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -174,6 +174,12 @@ struct CUDAKernelTy : public GenericKernelTy {
return MaxBlockSize;
}
+ /// Get maximum cooperative group count
+ Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const override;
+
private:
/// Initialize the size of the arguments.
Error initArgsSize() {
@@ -1258,7 +1264,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
if (Res == CUDA_SUCCESS)
- Info.add("Cooperative Launch", (bool)TmpInt);
+ Info.add("Cooperative Launch", bool(TmpInt), "",
+ DeviceInfo::COOPERATIVE_LAUNCH_SUPPORT);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
if (Res == CUDA_SUCCESS)
@@ -1495,9 +1502,17 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
MaxDynCGroupMemLimit = MaxDynCGroupMem;
}
- CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
- NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ CUlaunchAttribute CoopAttr;
+ CoopAttr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+ CoopAttr.value.cooperative = KernelArgs.Flags.Cooperative;
+
+ CUlaunchConfig LaunchConfig = {NumBlocks[0], NumBlocks[1],
+ NumBlocks[2], NumThreads[0],
+ NumThreads[1], NumThreads[2],
+ MaxDynCGroupMem, Stream,
+ &CoopAttr, 1};
+
+ CUresult Res = cuLaunchKernelEx(&LaunchConfig, Func, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
@@ -1509,15 +1524,64 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
},
&GenericDevice.Plugin);
- return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
+ return Plugin::check(Res, "error in cuLaunchKernelEx for '%s': %s",
+ getName());
+}
+
+Expected<uint32_t> CUDAKernelTy::getMaxCooperativeGroupCount(
+ GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize, size_t DynamicSharedMemorySize) const {
+ CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+
+ uint32_t SupportsCooperative = 0;
+ if (auto Err = CUDADevice.getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, SupportsCooperative))
+ return Err;
+
+ if (!SupportsCooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "device does not support cooperative launch");
+
+ // Calculate total local work size
+ size_t LocalWorkSizeTotal = LocalWorkSize[0];
+ LocalWorkSizeTotal *= (WorkDim >= 2 ? LocalWorkSize[1] : 1);
+ LocalWorkSizeTotal *= (WorkDim == 3 ? LocalWorkSize[2] : 1);
+
+ // Query max active blocks per multiprocessor
+ int32_t MaxNumActiveGroupsPerCU = 0;
+ CUresult Res = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+ &MaxNumActiveGroupsPerCU, Func, LocalWorkSizeTotal,
+ DynamicSharedMemorySize);
+ if (auto Err = Plugin::check(
+ Res, "error in cuOccupancyMaxActiveBlocksPerMultiprocessor: %s"))
+ return Err;
+
+ assert(MaxNumActiveGroupsPerCU >= 0);
+
+ // Handle the case where we can't have all SMs active with at least 1 group
+ // per SM. In that case, the device is still able to run 1 work-group, hence
+ // we will manually check if it is possible with the available HW resources.
+ if (MaxNumActiveGroupsPerCU == 0)
+ // Check if we can launch at least 1 work-group
+ return (LocalWorkSizeTotal <= MaxNumThreads &&
+ DynamicSharedMemorySize <= CUDADevice.getMaxBlockSharedMemSize());
+
+ // Multiply by the number of multiprocessors (compute units) on the device
+ uint32_t NumMultiprocessors = 0;
+ if (auto Err = CUDADevice.getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, NumMultiprocessors))
+ return Err;
+
+ return NumMultiprocessors * MaxNumActiveGroupsPerCU;
}
-/// Class implementing the CUDA-specific functionalities of the global handler.
+/// Class implementing the CUDA-specific functionalities of the global
+/// handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
/// Get the metadata of a global from the device. The name and size of the
- /// global is read from DeviceGlobal and the address of the global is written
- /// to DeviceGlobal.
+ /// global is read from DeviceGlobal and the address of the global is
+ /// written to DeviceGlobal.
Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
DeviceImageTy &Image,
GlobalTy &DeviceGlobal) override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index df2b6f2c1dba9..91dabfbabd0d5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -103,6 +103,12 @@ struct GenELF64KernelTy : public GenericKernelTy {
if (!SupportsFFI)
return Plugin::error(ErrorCode::UNSUPPORTED,
"libffi is not available, cannot launch kernel");
+
+ // Cooperative kernel launch is not supported for host
+ if (KernelArgs.Flags.Cooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "cooperative kernel launch not supported for host");
+
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 57560f4fd0824..14cb6da9c8da6 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -135,6 +135,11 @@ class L0ContextTy {
const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+
+ /// Level Zero extension function pointer for kernel argument size query.
+ ze_result_t(ZE_APICALL *zexKernelGetArgumentSize)(
+ ze_kernel_handle_t hKernel, uint32_t argIndex,
+ uint32_t *pArgSize) = nullptr;
};
} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 001a41ba77d7b..f6f19284655eb 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -217,6 +217,9 @@ class L0DeviceTy final : public GenericDeviceTy {
bool IsAsyncEnabled = false;
+ /// Whether the device supports cooperative kernels.
+ bool SupportsCooperativeKernels = false;
+
/// Lock for this device.
std::mutex Mutex;
@@ -238,6 +241,9 @@ class L0DeviceTy final : public GenericDeviceTy {
/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair.
std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+ /// Check if device supports cooperative kernels.
+ bool checkCooperativeKernelSupport();
+
public:
L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
@@ -269,6 +275,8 @@ class L0DeviceTy final : public GenericDeviceTy {
Error deinitImpl() override;
ze_device_handle_t getZeDevice() const { return zeDevice; }
+ bool supportsCooperativeKernels() const { return SupportsCooperativeKernels; }
+
const L0ContextTy &getL0Context() const { return l0Context; }
L0ContextTy &getL0Context() { return l0Context; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 1d5a014d9d0a5..c1737655a44db 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -61,6 +61,8 @@ struct KernelPropertiesTy {
/// Cached input parameters used in the previous launch.
int32_t NumTeams = -1;
int32_t ThreadLimit = -1;
+ uint32_t NumKernelArgs = 0;
+ std::unique_ptr<uint32_t[]> ArgSizes;
/// Cached parameters used in the previous launch.
ze_kernel_indirect_access_flags_t IndirectAccessFlags =
@@ -81,15 +83,17 @@ struct KernelPropertiesTy {
struct L0LaunchEnvTy {
bool IsAsync;
+ bool IsCooperative = false;
AsyncQueueTy *AsyncQueue;
ze_group_count_t GroupCounts = {0, 0, 0};
KernelPropertiesTy &KernelPR;
bool HalfNumThreads = false;
bool IsTeamsNDRange = false;
- L0LaunchEnvTy(bool IsAsync, AsyncQueueTy *AsyncQueue,
+ L0LaunchEnvTy(bool IsAsync, bool IsCooperative, AsyncQueueTy *AsyncQueue,
KernelPropertiesTy &KernelPR)
- : IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR) {}
+ : IsAsync(IsAsync), IsCooperative(IsCooperative), AsyncQueue(AsyncQueue),
+ KernelPR(KernelPR) {}
};
class L0KernelTy : public GenericKernelTy {
@@ -138,6 +142,11 @@ class L0KernelTy : public GenericKernelTy {
"maxGroupSize not implemented yet");
}
+ Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const override;
+
ze_kernel_handle_t getZeKernel() const { return zeKernel; }
Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
index 3d1588bc5fecc..158da8e7136f0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Context.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -26,6 +26,13 @@ Error L0ContextTy::init() {
return Err;
if (auto Err = HostMemAllocator.initHostPool(*this, Plugin.getOptions()))
return Err;
+
+ ze_result_t RC;
+ CALL_ZE(RC, zeDriverGetExtensionFunctionAddress, zeDriver,
+ "zexKernelGetArgumentSize", (void **)&zexKernelGetArgumentSize);
+ if (RC != ZE_RESULT_SUCCESS)
+ zexKernelGetArgumentSize = nullptr;
+
return Plugin::success();
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 4db3c4e47f544..b6d5dad317c1d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -16,6 +16,7 @@
#include "L0Plugin.h"
#include "L0Program.h"
#include "L0Trace.h"
+#include "PluginInterface.h"
namespace llvm::omp::target::plugin {
@@ -158,6 +159,28 @@ std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
return Ordinal;
}
+/// Check if device supports cooperative kernels by checking if any command
+/// queue group has the cooperative kernels flag set.
+bool L0DeviceTy::checkCooperativeKernelSupport() {
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(false, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+
+ std::vector<ze_command_queue_group_properties_t> Properties(
+ Count,
+ {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0});
+ CALL_ZE_RET(false, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+
+ for (auto &Property : Properties)
+ if (Property.flags &
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COOPERATIVE_KERNELS)
+ return true;
+
+ return false;
+}
+
void L0DeviceTy::reportDeviceInfo() const {
ODBG_OS(OLDT_Device, [&](llvm::raw_ostream &O) {
O << "Device " << DeviceId << " information\n"
@@ -216,6 +239,8 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
CopyOrdinal = findCopyOrdinal();
+ SupportsCooperativeKernels = checkCooperativeKernelSupport();
+
IsAsyncEnabled =
isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
if (auto Err = MemAllocator.initDevicePools(*this, Options))
@@ -629,6 +654,8 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
DeviceInfo::MEMORY_CLOCK_RATE);
Info.add("Memory Address Size", uint64_t{64u}, "bits",
DeviceInfo::ADDRESS_BITS);
+ Info.add("Cooperative launch support", SupportsCooperativeKernels, "",
+ DeviceInfo::COOPERATIVE_LAUNCH_SUPPORT);
return Info;
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b608e6ffe7931..c5ff6385386d0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -54,14 +54,25 @@ Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP);
KernelPR.SIMDWidth = KP.maxSubgroupSize;
KernelPR.Width = KP.maxSubgroupSize;
-
+ KernelPR.NumKernelArgs = KP.numKernelArgs;
if (KP.pNext)
KernelPR.Width = KPrefGRPSize.preferredMultiple;
- if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen)) {
+ if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen))
KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth);
- }
+
KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups;
+
+ // Query and cache argument sizes if extension is available
+ auto &Context = l0Device.getL0Context();
+ if (KernelPR.NumKernelArgs > 0 && Context.zexKernelGetArgumentSize) {
+ KernelPR.ArgSizes = std::make_unique<uint32_t[]>(KernelPR.NumKernelArgs);
+ for (uint32_t I = 0; I < KernelPR.NumKernelArgs; I++) {
+ CALL_ZE_RET_ERROR(Context.zexKernelGetArgumentSize, zeKernel, I,
+ &KernelPR.ArgSizes[I]);
+ }
+ }
+
return Plugin::success();
}
@@ -282,8 +293,16 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Kernel depends on %zu data copying events.\n", NumWaitEvents);
- CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
- &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+ if (KEnv.IsCooperative) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Launching cooperative kernel " DPxMOD "\n", DPxPTR(zeKernel));
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
+ WaitEvents);
+ } else {
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+ }
KEnv.KernelPR.Mtx.unlock();
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
@@ -322,8 +341,15 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
"Using regular command list for kernel submission.\n");
ze_event_handle_t Event = nullptr;
- CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
- &KEnv.GroupCounts, Event, 0, nullptr);
+ if (KEnv.IsCooperative) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Launching cooperative kernel " DPxMOD "\n", DPxPTR(zeKernel));
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &KEnv.GroupCounts, Event, 0, nullptr);
+ } else {
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &KEnv.GroupCounts, Event, 0, nullptr);
+ }
KEnv.KernelPR.Mtx.unlock();
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
@@ -421,7 +447,7 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
auto zeKernel = getZeKernel();
auto DeviceId = l0Device.getDeviceId();
- int32_t NumArgs = KernelArgs.NumArgs;
+
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
DPxPTR(zeKernel));
@@ -437,8 +463,8 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
auto *AsyncQueue =
IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
auto &KernelPR = getProperties();
-
- L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
+ bool IsCooperative = KernelArgs.Flags.Cooperative;
+ L0LaunchEnvTy KEnv(IsAsync, IsCooperative, AsyncQueue, KernelPR);
// Protect from kernel preparation to submission as kernels are shared.
KernelPR.Mtx.lock();
@@ -446,17 +472,50 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
return Err;
+ // Validate cooperative-launch constraints before submitting the kernel.
+ if (IsCooperative) {
+ uint32_t MaxCooperativeGroupCount = 0;
+ CALL_ZE_RET_ERROR(zeKernelSuggestMaxCooperativeGroupCount, zeKernel,
+ &MaxCooperativeGroupCount);
+
+ uint32_t TotalGroupCount = KEnv.GroupCounts.groupCountX *
+ KEnv.GroupCounts.groupCountY *
+ KEnv.GroupCounts.groupCountZ;
+
+ if (TotalGroupCount > MaxCooperativeGroupCount) {
+ KernelPR.Mtx.unlock();
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "cooperative kernel launch failed: requested %u groups exceeds "
+ "maximum %u cooperative groups supported by device",
+ TotalGroupCount, MaxCooperativeGroupCount);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Cooperative kernel validated: using %u groups (max: %u)\n",
+ TotalGroupCount, MaxCooperativeGroupCount);
+ }
+
// Set kernel arguments.
- for (int32_t I = 0; I < NumArgs; I++) {
- // Scope code to ease integration with downstream custom code.
- {
- void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
- CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
- Arg == nullptr ? nullptr : &Arg);
+ uint32_t NumArgs = Properties.NumKernelArgs;
+ if (NumArgs > 0) {
+ if (!KernelArgs.ArgSizes && !Properties.ArgSizes)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "level zero plugin requires kernel argument sizes.");
+ // Use user-provided argument sizes when available; otherwise fall back to
+ // the sizes recorded in the kernel properties.
+ char *Arg = static_cast<char *>(LaunchParams.Data);
+ for (uint32_t I = 0; I < NumArgs; I++) {
+ uint32_t ArgSize = KernelArgs.ArgSizes
+ ? static_cast<uint32_t>(KernelArgs.ArgSizes[I])
+ : Properties.ArgSizes[I];
+ CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, ArgSize, Arg);
+
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+ "Kernel Pointer argument %" PRIu32 " (value: " DPxMOD
") was set successfully for device %s.\n",
I, DPxPTR(Arg), IdStr);
+ Arg += ArgSize;
}
}
@@ -472,4 +531,32 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
}
+Expected<uint32_t> L0KernelTy::getMaxCooperativeGroupCount(
+ GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize, size_t DynamicSharedMemorySize) const {
+ (void)DynamicSharedMemorySize;
+
+ // Derive the 3D group size from the requested number of work dimensions.
+ uint32_t GroupSize[3];
+ GroupSize[0] = static_cast<uint32_t>(LocalWorkSize[0]);
+ GroupSize[1] = WorkDim >= 2 ? static_cast<uint32_t>(LocalWorkSize[1]) : 1;
+ GroupSize[2] = WorkDim == 3 ? static_cast<uint32_t>(LocalWorkSize[2]) : 1;
+
+ ze_result_t Res =
+ zeKernelSetGroupSize(zeKernel, GroupSize[0], GroupSize[1], GroupSize[2]);
+ if (Res != ZE_RESULT_SUCCESS)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "failed to set group size for cooperative launch");
+
+ uint32_t MaxCooperativeGroupCount = 0;
+ Res = zeKernelSuggestMaxCooperativeGroupCount(zeKernel,
+ &MaxCooperativeGroupCount);
+
+ if (Res != ZE_RESULT_SUCCESS)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "failed to query max cooperative group count");
+
+ return MaxCooperativeGroupCount;
+}
+
} // namespace llvm::omp::target::plugin
diff --git a/offload/tools/offload-tblgen/PrintGen.cpp b/offload/tools/offload-tblgen/PrintGen.cpp
index 89d7c820426cf..d2475d0d43a01 100644
--- a/offload/tools/offload-tblgen/PrintGen.cpp
+++ b/offload/tools/offload-tblgen/PrintGen.cpp
@@ -74,9 +74,10 @@ inline void printTagged(llvm::raw_ostream &os, const void *ptr, {0} value, size_
if (Type == "char[]") {
OS << formatv(TAB_2 "printPtr(os, (const char*) ptr);\n");
} else {
- if (Type == "void *")
- OS << formatv(TAB_2 "void * const * const tptr = (void * "
- "const * const)ptr;\n");
+ if (Type.ends_with("*"))
+ OS << formatv(TAB_2 "const {0} const * tptr = (const {0} "
+ "const *)ptr;\n",
+ Type);
else
OS << formatv(
TAB_2 "const {0} * const tptr = (const {0} * const)ptr;\n", Type);
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index 6e6c2738db510..d9cc0926760b7 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -296,7 +296,7 @@ void DeviceContext::launchKernelImpl(
LaunchSizeArgs.DynSharedMemory = 0;
OL_CHECK(olLaunchKernel(nullptr, DeviceHandle, KernelHandle, KernelArgs,
- KernelArgsSize, &LaunchSizeArgs));
+ KernelArgsSize, &LaunchSizeArgs, NULL));
}
[[nodiscard]] llvm::StringRef DeviceContext::getName() const noexcept {
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 031dbea660fbc..6a9cc267672e3 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -22,7 +22,8 @@ target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
add_offload_unittest("kernel"
kernel/olCalculateOptimalOccupancy.cpp
- kernel/olLaunchKernel.cpp)
+ kernel/olLaunchKernel.cpp
+ kernel/olLaunchKernelCooperative.cpp)
add_offload_unittest("memory"
memory/olMemAlloc.cpp
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 6f9961e2c6d58..9f91220b614aa 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -254,6 +254,43 @@ struct OffloadEventTest : OffloadQueueTest {
ol_event_handle_t Event = nullptr;
};
+struct LaunchKernelTestBase : OffloadQueueTest {
+ void SetUpProgram(const char *program) {
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
+ ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
+ ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+ DeviceBin->getBufferSize(), &Program));
+
+ LaunchArgs.Dimensions = 1;
+ LaunchArgs.GroupSize = {64, 1, 1};
+ LaunchArgs.NumGroups = {1, 1, 1};
+
+ LaunchArgs.DynSharedMemory = 0;
+ }
+
+ void TearDown() override {
+ if (Program) {
+ olDestroyProgram(Program);
+ }
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+ }
+
+ std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+ ol_program_handle_t Program = nullptr;
+ ol_kernel_launch_size_args_t LaunchArgs{};
+};
+
+struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
+ void SetUpKernel(const char *kernel) {
+ RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
+ ASSERT_SUCCESS(
+ olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel));
+ }
+
+ ol_symbol_handle_t Kernel = nullptr;
+};
+
// Devices might not be available for offload testing, so allow uninstantiated
// tests (as the device list will be empty). This means that all tests requiring
// a device will be silently skipped.
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 166b8dabff0d8..2f444e48330f8 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -10,43 +10,6 @@
#include <OffloadAPI.h>
#include <gtest/gtest.h>
-struct LaunchKernelTestBase : OffloadQueueTest {
- void SetUpProgram(const char *program) {
- RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
- ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
- ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
- ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
- DeviceBin->getBufferSize(), &Program));
-
- LaunchArgs.Dimensions = 1;
- LaunchArgs.GroupSize = {64, 1, 1};
- LaunchArgs.NumGroups = {1, 1, 1};
-
- LaunchArgs.DynSharedMemory = 0;
- }
-
- void TearDown() override {
- if (Program) {
- olDestroyProgram(Program);
- }
- RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
- }
-
- std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
- ol_program_handle_t Program = nullptr;
- ol_kernel_launch_size_args_t LaunchArgs{};
-};
-
-struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
- void SetUpKernel(const char *kernel) {
- RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
- ASSERT_SUCCESS(
- olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel));
- }
-
- ol_symbol_handle_t Kernel = nullptr;
-};
-
#define KERNEL_TEST(NAME, KERNEL) \
struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase { \
void SetUp() override { SetUpKernel(#KERNEL); } \
@@ -93,8 +56,8 @@ TEST_P(olLaunchKernelFooTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -116,7 +79,7 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
} Args{Mem};
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -131,7 +94,7 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
@@ -143,9 +106,8 @@ TEST_P(olLaunchKernelMultiArgsTest, Success) {
short C;
} Args{0, nullptr, 0};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
-
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
@@ -159,7 +121,7 @@ TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
} Args{Mem};
ASSERT_SUCCESS(olLaunchKernel(nullptr, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
uint32_t *Data = (uint32_t *)Mem;
for (uint32_t i = 0; i < 64; i++) {
@@ -182,8 +144,8 @@ TEST_P(olLaunchKernelLocalMemTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -205,8 +167,8 @@ TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -228,8 +190,8 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -249,10 +211,10 @@ TEST_P(olLaunchKernelGlobalTest, Success) {
} Args{Mem};
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *Data = (uint32_t *)Mem;
@@ -267,8 +229,9 @@ TEST_P(olLaunchKernelGlobalTest, InvalidNotAKernel) {
ol_symbol_handle_t Global = nullptr;
ASSERT_SUCCESS(
olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global));
- ASSERT_ERROR(OL_ERRC_SYMBOL_KIND,
- olLaunchKernel(Queue, Device, Global, nullptr, 0, &LaunchArgs));
+ ASSERT_ERROR(
+ OL_ERRC_SYMBOL_KIND,
+ olLaunchKernel(Queue, Device, Global, nullptr, 0, &LaunchArgs, NULL));
}
TEST_P(olLaunchKernelGlobalCtorTest, Success) {
@@ -279,8 +242,8 @@ TEST_P(olLaunchKernelGlobalCtorTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *Data = (uint32_t *)Mem;
@@ -296,6 +259,6 @@ TEST_P(olLaunchKernelGlobalDtorTest, Success) {
// find/implement a way, update this test. For now we just check that nothing
// crashes
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
new file mode 100644
index 0000000000000..3abe3f916693f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
@@ -0,0 +1,120 @@
+//===------- Offload API tests - olLaunchKernel Cooperative ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+struct olLaunchKernelCooperativeTest : LaunchSingleKernelTestBase {
+ void SetUp() override {
+ SetUpKernel("noargs");
+
+ bool SupportsCooperative = false;
+ auto Result =
+ olGetDeviceInfo(Device, OL_DEVICE_INFO_COOPERATIVE_LAUNCH_SUPPORT,
+ sizeof(bool), &SupportsCooperative);
+
+ if (Result) {
+ if (Result->Code == OL_ERRC_UNIMPLEMENTED) {
+ GTEST_SKIP()
+ << "Device does not provide cooperative launch support information";
+ }
+ if (Result->Code != OL_ERRC_SUCCESS) {
+ GTEST_FAIL() << "olGetDeviceInfo failed with unexpected error: "
+ << Result->Code << ": " << Result->Details;
+ }
+ }
+
+ if (!SupportsCooperative) {
+ GTEST_SKIP() << "Device does not support cooperative kernel launch";
+ }
+ }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelCooperativeTest);
+
+TEST_P(olLaunchKernelCooperativeTest, GetMaxCooperativeGroupCount) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+ ASSERT_GT(MaxGroupCount, 0u);
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SuccessCooperative) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = MaxGroupCount;
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_SUCCESS(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SuccessNonCooperative) {
+ ASSERT_SUCCESS(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, TooManyGroups) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = MaxGroupCount * 2;
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_ANY_ERROR(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SynchronousLaunch) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = std::min(MaxGroupCount, 2u);
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_SUCCESS(
+ olLaunchKernel(nullptr, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullHandleKernel) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetKernelMaxCooperativeGroupCount(Device, nullptr, &LaunchArgs,
+ &MaxGroupCount));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullHandleDevice) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetKernelMaxCooperativeGroupCount(nullptr, Kernel, &LaunchArgs,
+ &MaxGroupCount));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullPointerGroupCountRet) {
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olGetKernelMaxCooperativeGroupCount(Device, Kernel, &LaunchArgs,
+ nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
index cc67d782ef403..d9745c87e3496 100644
--- a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
@@ -171,7 +171,7 @@ TEST_P(olMemcpyGlobalTest, SuccessWrite) {
olMemcpy(Queue, Addr, Device, SourceMem, Host, 64 * sizeof(uint32_t)));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, ReadKernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *DestData = (uint32_t *)DestMem;
@@ -188,8 +188,8 @@ TEST_P(olMemcpyGlobalTest, SuccessRead) {
LaunchArgs.GroupSize.x * sizeof(uint32_t),
&DestMem));
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, WriteKernel, nullptr, 0, &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, WriteKernel, nullptr, 0,
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(
olMemcpy(Queue, DestMem, Host, Addr, Device, 64 * sizeof(uint32_t)));
diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
index aa86750f6adf9..e0b3d1c280ac2 100644
--- a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
+++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
@@ -77,8 +77,8 @@ TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) {
struct {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
std::this_thread::sleep_for(std::chrono::milliseconds(500));
for (uint32_t i = 0; i < 64; i++) {
diff --git a/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp b/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
index 9838562752cc4..9011f17eff5e9 100644
--- a/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
+++ b/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
@@ -52,7 +52,7 @@ TEST_P(olWaitEventsTest, Success) {
ASSERT_SUCCESS(olWaitEvents(Queues[I], &Events[I - 1], 1));
ASSERT_SUCCESS(olLaunchKernel(Queues[I], Device, Kernel, &Args,
- sizeof(Args), &LaunchArgs));
+ sizeof(Args), &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queues[I], &Events[I]));
}
@@ -86,7 +86,7 @@ TEST_P(olWaitEventsTest, SuccessSingleQueue) {
ASSERT_SUCCESS(olWaitEvents(Queue, &Events[I - 1], 1));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queue, &Events[I]));
}
@@ -120,7 +120,7 @@ TEST_P(olWaitEventsTest, SuccessMultipleEvents) {
ASSERT_SUCCESS(olWaitEvents(Queues[I], Events, I));
ASSERT_SUCCESS(olLaunchKernel(Queues[I], Device, Kernel, &Args,
- sizeof(Args), &LaunchArgs));
+ sizeof(Args), &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queues[I], &Events[I]));
}
More information about the llvm-commits
mailing list