[llvm] [offload] Add properties parameter to olLaunchKernel (PR #184343)
Łukasz Plewa via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 03:42:34 PST 2026
https://github.com/lplewa updated https://github.com/llvm/llvm-project/pull/184343
>From 41d3cb2ea25d60f9267d264ca00ce49329ea1892 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Plewa?= <lukasz.plewa at intel.com>
Date: Thu, 18 Dec 2025 14:28:10 +0100
Subject: [PATCH] [offload] Add properties parameter to olLaunchKernel
Introduce a properties argument to olLaunchKernel to enable future
extensions.
This change adds initial support for:
- cooperative kernel launch
- extended kernel argument size (required by the L0 plugin)
---
offload/include/Shared/APITypes.h | 9 +-
offload/liboffload/API/Device.td | 9 +-
offload/liboffload/API/Kernel.td | 73 +++++++++--
offload/liboffload/src/OffloadImpl.cpp | 61 ++++++++-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 5 +
.../common/include/PluginInterface.h | 9 ++
.../cuda/dynamic_cuda/cuda.cpp | 3 +-
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 48 ++++++-
offload/plugins-nextgen/cuda/src/rtl.cpp | 80 ++++++++++--
offload/plugins-nextgen/host/src/rtl.cpp | 6 +
.../level_zero/include/L0Context.h | 5 +
.../level_zero/include/L0Device.h | 8 ++
.../level_zero/include/L0Kernel.h | 13 +-
.../level_zero/src/L0Context.cpp | 7 +
.../level_zero/src/L0Device.cpp | 27 ++++
.../level_zero/src/L0Kernel.cpp | 121 +++++++++++++++---
offload/tools/offload-tblgen/PrintGen.cpp | 7 +-
.../Conformance/lib/DeviceContext.cpp | 2 +-
offload/unittests/OffloadAPI/CMakeLists.txt | 3 +-
.../unittests/OffloadAPI/common/Fixtures.hpp | 37 ++++++
.../OffloadAPI/kernel/olLaunchKernel.cpp | 79 +++---------
.../kernel/olLaunchKernelCooperative.cpp | 120 +++++++++++++++++
.../unittests/OffloadAPI/memory/olMemcpy.cpp | 6 +-
.../OffloadAPI/queue/olLaunchHostFunction.cpp | 4 +-
.../OffloadAPI/queue/olWaitEvents.cpp | 6 +-
25 files changed, 630 insertions(+), 118 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..9213f2924f1f7 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -100,10 +100,11 @@ struct KernelArgsTy {
uint64_t Tripcount =
0; // Tripcount for the teams / distribute loop, 0 otherwise.
struct {
- uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
- uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
+ uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+ uint64_t Cooperative : 1; // Was this kernel spawned as cooperative.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index 6ada191089674..7790386ae02e1 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -47,7 +47,14 @@ def ol_device_info_t : Enum {
];
list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
- let etors = !listconcat(basic_etors, fp_configs, native_vec_widths);
+ // This list is maintained separately to allow adding new basic etors without
+ // changing the values of previous ones.
+ list<TaggedEtor> basic_etors2 =
+ [TaggedEtor<"COOPERATIVE_LAUNCH_SUPPORT", "bool",
+ "Is cooperative kernel launch supported">,
+ ];
+ let etors =
+ !listconcat(basic_etors, fp_configs, native_vec_widths, basic_etors2);
}
def ol_device_fp_capability_flag_t : Enum {
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 2f5692a19d712..1e713bed1e903 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -20,20 +20,52 @@ def ol_kernel_launch_size_args_t : Struct {
];
}
+def OL_KERNEL_LAUNCH_PROP_END : Macro {
+ let desc = "last element of the ol_kernel_launch_prop_t array";
+ let value = "{OL_KERNEL_LAUNCH_PROP_TYPE_NONE, NULL}";
+}
+
+def ol_kernel_launch_prop_type_t : Enum {
+ let desc = "Defines structure type";
+ let is_typed = 1;
+ let etors =
+ [TaggedEtor<"none", "void *", "Used for null terminating property array">,
+ TaggedEtor<"size", "size_t *", "Array of the arguments sizes.">,
+ TaggedEtor<"is_cooperative ", "bool *", "Cooperative kernel launch">];
+}
+
+def ol_kernel_launch_prop_t : Struct {
+ let desc = "Optional properties for kernel launch.";
+ let members = [StructMember<"ol_kernel_launch_prop_type_t", "type",
+ "Type of the data field">,
+ StructMember<"void *", "data",
+ "Pointer to property-specific data.">];
+}
+
def olLaunchKernel : Function {
let desc = "Enqueue a kernel launch with the specified size and parameters.";
let details = [
"If a queue is not specified, kernel execution happens synchronously",
"ArgumentsData may be set to NULL (to indicate no parameters)"
];
- let params = [
- Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
- Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
- Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
- Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
- Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
- Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
- ];
+ let params =
+ [Param<"ol_queue_handle_t", "Queue", "handle of the queue",
+ PARAM_IN_OPTIONAL>,
+ Param<"ol_device_handle_t", "Device",
+ "handle of the device to execute on", PARAM_IN>,
+ Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel",
+ PARAM_IN>,
+ Param<"const void*", "ArgumentsData",
+ "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
+ Param<"size_t", "ArgumentsSize", "size of the kernel argument struct",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs",
+ "pointer to the struct containing launch size parameters",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_prop_t *", "Properties",
+ "Array of optional properties, last element must be "
+ "OL_KERNEL_LAUNCH_PROP_END",
+ PARAM_IN_OPTIONAL>];
let returns = [
Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
@@ -57,3 +89,28 @@ def olCalculateOptimalOccupancy : Function {
Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
];
}
+
+def olGetKernelMaxCooperativeGroupCount : Function {
+ let desc = "Query the maximum number of work groups that can be launched "
+ "cooperatively for a kernel.";
+ let details =
+ ["This function returns the maximum number of work groups that can "
+ "participate in a cooperative launch for the given kernel.",
+ "The maximum count depends on the work group size and dynamic shared "
+ "memory usage.",
+ ];
+ let params = [Param<"ol_device_handle_t", "Device",
+ "device intended to run the kernel", PARAM_IN>,
+ Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel",
+ PARAM_IN>,
+ Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs",
+ "launch size parameters", PARAM_IN>,
+ Param<"uint32_t*", "MaxGroupCount",
+ "maximum number of cooperative groups", PARAM_OUT>];
+ let returns =
+ [Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+ Return<
+ "OL_ERRC_UNSUPPORTED", ["Cooperative launch is not supported or "
+ "backend cannot provide this information"]>,
+ ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index dd3ec0f61b4da..7b5bbb6088c4b 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -496,7 +496,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
"plugin returned incorrect type");
return Info.writeString(std::get<std::string>(Entry->Value).c_str());
}
-
+ case OL_DEVICE_INFO_COOPERATIVE_LAUNCH_SUPPORT: {
+ // Bool value
+ if (!std::holds_alternative<bool>(Entry->Value))
+ return makeError(ErrorCode::BACKEND_FAILURE,
+ "plugin returned incorrect type");
+ return Info.write(static_cast<uint8_t>(std::get<bool>(Entry->Value)));
+ }
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE:
case OL_DEVICE_INFO_MAX_WORK_SIZE:
case OL_DEVICE_INFO_VENDOR_ID:
@@ -1032,10 +1038,39 @@ Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
return Error::success();
}
+Error olGetKernelMaxCooperativeGroupCount_impl(
+ ol_device_handle_t Device, ol_symbol_handle_t Kernel,
+ const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+ uint32_t *MaxGroupCount) {
+ if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+ return createOffloadError(ErrorCode::SYMBOL_KIND,
+ "provided symbol is not a kernel");
+
+ GenericDeviceTy *DeviceImpl = Device->Device;
+ auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
+
+ // Extract work group size from LaunchSizeArgs
+ size_t LocalWorkSize[3];
+ LocalWorkSize[0] = LaunchSizeArgs->GroupSize.x;
+ LocalWorkSize[1] = LaunchSizeArgs->GroupSize.y;
+ LocalWorkSize[2] = LaunchSizeArgs->GroupSize.z;
+
+ auto Res = KernelImpl->getMaxCooperativeGroupCount(
+ *DeviceImpl, LaunchSizeArgs->Dimensions, LocalWorkSize,
+ LaunchSizeArgs->DynSharedMemory);
+ if (auto Err = Res.takeError())
+ return Err;
+
+ *MaxGroupCount = *Res;
+
+ return Error::success();
+}
+
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_symbol_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
- const ol_kernel_launch_size_args_t *LaunchSizeArgs) {
+ const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+ const ol_kernel_launch_prop_t *Properties) {
auto *DeviceImpl = Device->Device;
if (Queue && Device != Queue->Device) {
return createOffloadError(
@@ -1048,7 +1083,6 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
"provided symbol is not a kernel");
auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
- AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
KernelArgsTy LaunchArgs{};
LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroups.x;
LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroups.y;
@@ -1058,6 +1092,27 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
+ while (Properties && Properties->type != OL_KERNEL_LAUNCH_PROP_TYPE_NONE) {
+ switch (Properties->type) {
+ case OL_KERNEL_LAUNCH_PROP_TYPE_SIZE:
+ // This API requests size_t, while KernelArgsTy uses int64_t.
+ // TODO: We might want to change this struct to also use const size_t
+ LaunchArgs.ArgSizes = const_cast<int64_t *>(
+ reinterpret_cast<const int64_t *>(Properties->data));
+ break;
+ case OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE:
+ LaunchArgs.Flags.Cooperative =
+ *reinterpret_cast<const bool *>(Properties->data);
+ break;
+ default:
+ return createOffloadError(ErrorCode::INVALID_ENUMERATION,
+ "olLaunchKernel property enum '%i' is invalid",
+ Properties->type);
+ }
+ Properties++;
+ }
+
+ AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
KernelLaunchParamsTy Params;
Params.Data = const_cast<void *>(ArgumentsData);
Params.Size = ArgumentsSize;
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 379c8ec11225d..fb413f2363821 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3737,6 +3737,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ // Cooperative kernel launch is not yet supported for AMDGPU
+ if (KernelArgs.Flags.Cooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "cooperative kernel launch not supported for AMDGPU");
+
AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 1c59ed1eda841..fcd08aeff192e 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -370,6 +370,15 @@ struct GenericKernelTy {
virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
uint64_t DynamicMemSize) const = 0;
+ /// Get the maximum number of work groups that can be launched cooperatively.
+ virtual Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const {
+ return Plugin::error(error::ErrorCode::UNSUPPORTED,
+ "cooperative launch not supported");
+ }
+
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 80e3e418ae3fa..d0916789d0f57 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -42,7 +42,7 @@ DLWRAP(cuDeviceTotalMem, 2)
DLWRAP(cuDriverGetVersion, 1)
DLWRAP(cuGetErrorString, 2)
-DLWRAP(cuLaunchKernel, 11)
+DLWRAP(cuLaunchKernelEx, 4)
DLWRAP(cuLaunchHostFunc, 3)
DLWRAP(cuMemAlloc, 2)
@@ -83,6 +83,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor, 4)
DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 7e42c66dddabb..eb149be05777d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -295,6 +295,48 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
typedef size_t (*CUoccupancyB2DSize)(int);
+typedef enum CUlaunchAttributeID_enum {
+ CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+ CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1,
+ CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2,
+ CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+ CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4,
+ CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+ CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6,
+ CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7,
+ CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
+ CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
+ CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
+ CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
+ CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
+ CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
+ CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14
+} CUlaunchAttributeID;
+
+typedef union CUlaunchAttributeValue_union {
+ char pad[64];
+ int cooperative;
+} CUlaunchAttributeValue;
+
+typedef struct CUlaunchAttribute_st {
+ CUlaunchAttributeID id;
+ char pad[8 - sizeof(CUlaunchAttributeID)];
+ CUlaunchAttributeValue value;
+} CUlaunchAttribute;
+
+typedef struct CUlaunchConfig_st {
+ unsigned int gridDimX;
+ unsigned int gridDimY;
+ unsigned int gridDimZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ unsigned int sharedMemBytes;
+ CUstream hStream;
+ CUlaunchAttribute *attrs;
+ unsigned int numAttrs;
+} CUlaunchConfig;
+
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -310,9 +352,7 @@ CUresult cuDriverGetVersion(int *);
CUresult cuGetErrorString(CUresult, const char **);
CUresult cuInit(unsigned);
-CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
- unsigned, unsigned, unsigned, CUstream, void **,
- void **);
+CUresult cuLaunchKernelEx(const CUlaunchConfig *, CUfunction, void **, void **);
CUresult cuLaunchHostFunc(CUstream, CUhostFn, void *);
CUresult cuMemAlloc(CUdeviceptr *, size_t);
@@ -390,6 +430,8 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int *, CUfunction, int,
+ size_t);
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index d5ab0b3309c86..d0b56ee364f56 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -174,6 +174,12 @@ struct CUDAKernelTy : public GenericKernelTy {
return MaxBlockSize;
}
+ /// Get maximum cooperative group count
+ Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const override;
+
private:
/// Initialize the size of the arguments.
Error initArgsSize() {
@@ -1258,7 +1264,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
if (Res == CUDA_SUCCESS)
- Info.add("Cooperative Launch", (bool)TmpInt);
+ Info.add("Cooperative Launch", bool(TmpInt), "",
+ DeviceInfo::COOPERATIVE_LAUNCH_SUPPORT);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
if (Res == CUDA_SUCCESS)
@@ -1495,9 +1502,17 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
MaxDynCGroupMemLimit = MaxDynCGroupMem;
}
- CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
- NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ CUlaunchAttribute CoopAttr;
+ CoopAttr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+ CoopAttr.value.cooperative = KernelArgs.Flags.Cooperative;
+
+ CUlaunchConfig LaunchConfig = {NumBlocks[0], NumBlocks[1],
+ NumBlocks[2], NumThreads[0],
+ NumThreads[1], NumThreads[2],
+ MaxDynCGroupMem, Stream,
+ &CoopAttr, 1};
+
+ CUresult Res = cuLaunchKernelEx(&LaunchConfig, Func, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
@@ -1509,15 +1524,64 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
},
&GenericDevice.Plugin);
- return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
+ return Plugin::check(Res, "error in cuLaunchKernelEx for '%s': %s",
+ getName());
+}
+
+Expected<uint32_t> CUDAKernelTy::getMaxCooperativeGroupCount(
+ GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize, size_t DynamicSharedMemorySize) const {
+ CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+
+ uint32_t SupportsCooperative = 0;
+ if (auto Err = CUDADevice.getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, SupportsCooperative))
+ return Err;
+
+ if (!SupportsCooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "device does not support cooperative launch");
+
+ // Calculate total local work size
+ size_t LocalWorkSizeTotal = LocalWorkSize[0];
+ LocalWorkSizeTotal *= (WorkDim >= 2 ? LocalWorkSize[1] : 1);
+ LocalWorkSizeTotal *= (WorkDim == 3 ? LocalWorkSize[2] : 1);
+
+ // Query max active blocks per multiprocessor
+ int32_t MaxNumActiveGroupsPerCU = 0;
+ CUresult Res = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+ &MaxNumActiveGroupsPerCU, Func, LocalWorkSizeTotal,
+ DynamicSharedMemorySize);
+ if (auto Err = Plugin::check(
+ Res, "error in cuOccupancyMaxActiveBlocksPerMultiprocessor: %s"))
+ return Err;
+
+ assert(MaxNumActiveGroupsPerCU >= 0);
+
+ // Handle the case where we can't have all SMs active with at least 1 group
+ // per SM. In that case, the device is still able to run 1 work-group, hence
+ // we will manually check if it is possible with the available HW resources.
+ if (MaxNumActiveGroupsPerCU == 0)
+ // Check if we can launch at least 1 work-group
+ return (LocalWorkSizeTotal <= MaxNumThreads &&
+ DynamicSharedMemorySize <= CUDADevice.getMaxBlockSharedMemSize());
+
+ // Multiply by the number of multiprocessors (compute units) on the device
+ uint32_t NumMultiprocessors = 0;
+ if (auto Err = CUDADevice.getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, NumMultiprocessors))
+ return Err;
+
+ return NumMultiprocessors * MaxNumActiveGroupsPerCU;
}
-/// Class implementing the CUDA-specific functionalities of the global handler.
+/// Class implementing the CUDA-specific functionalities of the global
+/// handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
/// Get the metadata of a global from the device. The name and size of the
- /// global is read from DeviceGlobal and the address of the global is written
- /// to DeviceGlobal.
+ /// global is read from DeviceGlobal and the address of the global is
+ /// written to DeviceGlobal.
Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
DeviceImageTy &Image,
GlobalTy &DeviceGlobal) override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index df2b6f2c1dba9..91dabfbabd0d5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -103,6 +103,12 @@ struct GenELF64KernelTy : public GenericKernelTy {
if (!SupportsFFI)
return Plugin::error(ErrorCode::UNSUPPORTED,
"libffi is not available, cannot launch kernel");
+
+ // Cooperative kernel launch is not supported for host
+ if (KernelArgs.Flags.Cooperative)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "cooperative kernel launch not supported for host");
+
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 57560f4fd0824..14cb6da9c8da6 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -135,6 +135,11 @@ class L0ContextTy {
const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+
+ /// Level Zero extension function pointer for kernel argument size query.
+ ze_result_t(ZE_APICALL *zexKernelGetArgumentSize)(
+ ze_kernel_handle_t hKernel, uint32_t argIndex,
+ uint32_t *pArgSize) = nullptr;
};
} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 001a41ba77d7b..f6f19284655eb 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -217,6 +217,9 @@ class L0DeviceTy final : public GenericDeviceTy {
bool IsAsyncEnabled = false;
+ /// Whether the device supports cooperative kernels.
+ bool SupportsCooperativeKernels = false;
+
/// Lock for this device.
std::mutex Mutex;
@@ -238,6 +241,9 @@ class L0DeviceTy final : public GenericDeviceTy {
/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair.
std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+ /// Check if device supports cooperative kernels.
+ bool checkCooperativeKernelSupport();
+
public:
L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
@@ -269,6 +275,8 @@ class L0DeviceTy final : public GenericDeviceTy {
Error deinitImpl() override;
ze_device_handle_t getZeDevice() const { return zeDevice; }
+ bool supportsCooperativeKernels() const { return SupportsCooperativeKernels; }
+
const L0ContextTy &getL0Context() const { return l0Context; }
L0ContextTy &getL0Context() { return l0Context; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 1d5a014d9d0a5..c1737655a44db 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -61,6 +61,8 @@ struct KernelPropertiesTy {
/// Cached input parameters used in the previous launch.
int32_t NumTeams = -1;
int32_t ThreadLimit = -1;
+ uint32_t NumKernelArgs = 0;
+ std::unique_ptr<uint32_t[]> ArgSizes;
/// Cached parameters used in the previous launch.
ze_kernel_indirect_access_flags_t IndirectAccessFlags =
@@ -81,15 +83,17 @@ struct KernelPropertiesTy {
struct L0LaunchEnvTy {
bool IsAsync;
+ bool IsCooperative = false;
AsyncQueueTy *AsyncQueue;
ze_group_count_t GroupCounts = {0, 0, 0};
KernelPropertiesTy &KernelPR;
bool HalfNumThreads = false;
bool IsTeamsNDRange = false;
- L0LaunchEnvTy(bool IsAsync, AsyncQueueTy *AsyncQueue,
+ L0LaunchEnvTy(bool IsAsync, bool IsCooperative, AsyncQueueTy *AsyncQueue,
KernelPropertiesTy &KernelPR)
- : IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR) {}
+ : IsAsync(IsAsync), IsCooperative(IsCooperative), AsyncQueue(AsyncQueue),
+ KernelPR(KernelPR) {}
};
class L0KernelTy : public GenericKernelTy {
@@ -138,6 +142,11 @@ class L0KernelTy : public GenericKernelTy {
"maxGroupSize not implemented yet");
}
+ Expected<uint32_t>
+ getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize,
+ size_t DynamicSharedMemorySize) const override;
+
ze_kernel_handle_t getZeKernel() const { return zeKernel; }
Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
index 3d1588bc5fecc..158da8e7136f0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Context.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -26,6 +26,13 @@ Error L0ContextTy::init() {
return Err;
if (auto Err = HostMemAllocator.initHostPool(*this, Plugin.getOptions()))
return Err;
+
+ ze_result_t RC;
+ CALL_ZE(RC, zeDriverGetExtensionFunctionAddress, zeDriver,
+ "zexKernelGetArgumentSize", (void **)&zexKernelGetArgumentSize);
+ if (RC != ZE_RESULT_SUCCESS)
+ zexKernelGetArgumentSize = nullptr;
+
return Plugin::success();
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 4db3c4e47f544..b6d5dad317c1d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -16,6 +16,7 @@
#include "L0Plugin.h"
#include "L0Program.h"
#include "L0Trace.h"
+#include "PluginInterface.h"
namespace llvm::omp::target::plugin {
@@ -158,6 +159,28 @@ std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
return Ordinal;
}
+/// Check if device supports cooperative kernels by checking if any command
+/// queue group has the cooperative kernels flag set.
+bool L0DeviceTy::checkCooperativeKernelSupport() {
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(false, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+
+ std::vector<ze_command_queue_group_properties_t> Properties(
+ Count,
+ {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0});
+ CALL_ZE_RET(false, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+
+ for (auto &Property : Properties)
+ if (Property.flags &
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COOPERATIVE_KERNELS)
+ return true;
+
+ return false;
+}
+
void L0DeviceTy::reportDeviceInfo() const {
ODBG_OS(OLDT_Device, [&](llvm::raw_ostream &O) {
O << "Device " << DeviceId << " information\n"
@@ -216,6 +239,8 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
CopyOrdinal = findCopyOrdinal();
+ SupportsCooperativeKernels = checkCooperativeKernelSupport();
+
IsAsyncEnabled =
isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
if (auto Err = MemAllocator.initDevicePools(*this, Options))
@@ -629,6 +654,8 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
DeviceInfo::MEMORY_CLOCK_RATE);
Info.add("Memory Address Size", uint64_t{64u}, "bits",
DeviceInfo::ADDRESS_BITS);
+ Info.add("Cooperative launch support", SupportsCooperativeKernels, "",
+ DeviceInfo::COOPERATIVE_LAUNCH_SUPPORT);
return Info;
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b608e6ffe7931..c5ff6385386d0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -54,14 +54,25 @@ Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP);
KernelPR.SIMDWidth = KP.maxSubgroupSize;
KernelPR.Width = KP.maxSubgroupSize;
-
+ KernelPR.NumKernelArgs = KP.numKernelArgs;
if (KP.pNext)
KernelPR.Width = KPrefGRPSize.preferredMultiple;
- if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen)) {
+ if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen))
KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth);
- }
+
KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups;
+
+ // Query and cache argument sizes if extension is available
+ auto &Context = l0Device.getL0Context();
+ if (KernelPR.NumKernelArgs > 0 && Context.zexKernelGetArgumentSize) {
+ KernelPR.ArgSizes = std::make_unique<uint32_t[]>(KernelPR.NumKernelArgs);
+ for (uint32_t I = 0; I < KernelPR.NumKernelArgs; I++) {
+ CALL_ZE_RET_ERROR(Context.zexKernelGetArgumentSize, zeKernel, I,
+ &KernelPR.ArgSizes[I]);
+ }
+ }
+
return Plugin::success();
}
@@ -282,8 +293,16 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Kernel depends on %zu data copying events.\n", NumWaitEvents);
- CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
- &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+ if (KEnv.IsCooperative) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Launching cooperative kernel " DPxMOD "\n", DPxPTR(zeKernel));
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
+ WaitEvents);
+ } else {
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+ }
KEnv.KernelPR.Mtx.unlock();
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
@@ -322,8 +341,15 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
"Using regular command list for kernel submission.\n");
ze_event_handle_t Event = nullptr;
- CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
- &KEnv.GroupCounts, Event, 0, nullptr);
+ if (KEnv.IsCooperative) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Launching cooperative kernel " DPxMOD "\n", DPxPTR(zeKernel));
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &KEnv.GroupCounts, Event, 0, nullptr);
+ } else {
+ CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &KEnv.GroupCounts, Event, 0, nullptr);
+ }
KEnv.KernelPR.Mtx.unlock();
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
@@ -421,7 +447,7 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
auto zeKernel = getZeKernel();
auto DeviceId = l0Device.getDeviceId();
- int32_t NumArgs = KernelArgs.NumArgs;
+
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
DPxPTR(zeKernel));
@@ -437,8 +463,8 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
auto *AsyncQueue =
IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
auto &KernelPR = getProperties();
-
- L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
+ bool IsCooperative = KernelArgs.Flags.Cooperative;
+ L0LaunchEnvTy KEnv(IsAsync, IsCooperative, AsyncQueue, KernelPR);
// Protect from kernel preparation to submission as kernels are shared.
KernelPR.Mtx.lock();
@@ -446,17 +472,50 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
return Err;
+ // Validate cooperative-launch constraints before submitting the kernel.
+ if (IsCooperative) {
+ uint32_t MaxCooperativeGroupCount = 0;
+ CALL_ZE_RET_ERROR(zeKernelSuggestMaxCooperativeGroupCount, zeKernel,
+ &MaxCooperativeGroupCount);
+
+ uint32_t TotalGroupCount = KEnv.GroupCounts.groupCountX *
+ KEnv.GroupCounts.groupCountY *
+ KEnv.GroupCounts.groupCountZ;
+
+ if (TotalGroupCount > MaxCooperativeGroupCount) {
+ KernelPR.Mtx.unlock();
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "cooperative kernel launch failed: requested %u groups exceeds "
+ "maximum %u cooperative groups supported by device",
+ TotalGroupCount, MaxCooperativeGroupCount);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Cooperative kernel validated: using %u groups (max: %u)\n",
+ TotalGroupCount, MaxCooperativeGroupCount);
+ }
+
// Set kernel arguments.
- for (int32_t I = 0; I < NumArgs; I++) {
- // Scope code to ease integration with downstream custom code.
- {
- void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
- CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
- Arg == nullptr ? nullptr : &Arg);
+ uint32_t NumArgs = Properties.NumKernelArgs;
+ if (NumArgs > 0) {
+ if (!KernelArgs.ArgSizes && !Properties.ArgSizes)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "level zero plugin requires kernel argument sizes.");
+ // Use user-provided argument sizes when available; otherwise fall back to
+ // the sizes recorded in the kernel properties.
+ char *Arg = static_cast<char *>(LaunchParams.Data);
+ for (uint32_t I = 0; I < NumArgs; I++) {
+ uint32_t ArgSize = KernelArgs.ArgSizes
+ ? static_cast<uint32_t>(KernelArgs.ArgSizes[I])
+ : Properties.ArgSizes[I];
+ CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, ArgSize, Arg);
+
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+ "Kernel Pointer argument %" PRIu32 " (value: " DPxMOD
") was set successfully for device %s.\n",
I, DPxPTR(Arg), IdStr);
+ Arg += ArgSize;
}
}
@@ -472,4 +531,32 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
}
+Expected<uint32_t> L0KernelTy::getMaxCooperativeGroupCount(
+ GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+ const size_t *LocalWorkSize, size_t DynamicSharedMemorySize) const {
+ (void)DynamicSharedMemorySize;
+
+ // Derive the 3D group size from the requested number of work dimensions.
+ uint32_t GroupSize[3];
+ GroupSize[0] = static_cast<uint32_t>(LocalWorkSize[0]);
+ GroupSize[1] = WorkDim >= 2 ? static_cast<uint32_t>(LocalWorkSize[1]) : 1;
+ GroupSize[2] = WorkDim == 3 ? static_cast<uint32_t>(LocalWorkSize[2]) : 1;
+
+ ze_result_t Res =
+ zeKernelSetGroupSize(zeKernel, GroupSize[0], GroupSize[1], GroupSize[2]);
+ if (Res != ZE_RESULT_SUCCESS)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "failed to set group size for cooperative launch");
+
+ uint32_t MaxCooperativeGroupCount = 0;
+ Res = zeKernelSuggestMaxCooperativeGroupCount(zeKernel,
+ &MaxCooperativeGroupCount);
+
+ if (Res != ZE_RESULT_SUCCESS)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "failed to query max cooperative group count");
+
+ return MaxCooperativeGroupCount;
+}
+
} // namespace llvm::omp::target::plugin
diff --git a/offload/tools/offload-tblgen/PrintGen.cpp b/offload/tools/offload-tblgen/PrintGen.cpp
index 89d7c820426cf..d2475d0d43a01 100644
--- a/offload/tools/offload-tblgen/PrintGen.cpp
+++ b/offload/tools/offload-tblgen/PrintGen.cpp
@@ -74,9 +74,10 @@ inline void printTagged(llvm::raw_ostream &os, const void *ptr, {0} value, size_
if (Type == "char[]") {
OS << formatv(TAB_2 "printPtr(os, (const char*) ptr);\n");
} else {
- if (Type == "void *")
- OS << formatv(TAB_2 "void * const * const tptr = (void * "
- "const * const)ptr;\n");
+ if (Type.ends_with("*"))
+ OS << formatv(TAB_2 "const {0} const * tptr = (const {0} "
+ "const *)ptr;\n",
+ Type);
else
OS << formatv(
TAB_2 "const {0} * const tptr = (const {0} * const)ptr;\n", Type);
diff --git a/offload/unittests/Conformance/lib/DeviceContext.cpp b/offload/unittests/Conformance/lib/DeviceContext.cpp
index 6e6c2738db510..d9cc0926760b7 100644
--- a/offload/unittests/Conformance/lib/DeviceContext.cpp
+++ b/offload/unittests/Conformance/lib/DeviceContext.cpp
@@ -296,7 +296,7 @@ void DeviceContext::launchKernelImpl(
LaunchSizeArgs.DynSharedMemory = 0;
OL_CHECK(olLaunchKernel(nullptr, DeviceHandle, KernelHandle, KernelArgs,
- KernelArgsSize, &LaunchSizeArgs));
+ KernelArgsSize, &LaunchSizeArgs, NULL));
}
[[nodiscard]] llvm::StringRef DeviceContext::getName() const noexcept {
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 031dbea660fbc..6a9cc267672e3 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -22,7 +22,8 @@ target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
add_offload_unittest("kernel"
kernel/olCalculateOptimalOccupancy.cpp
- kernel/olLaunchKernel.cpp)
+ kernel/olLaunchKernel.cpp
+ kernel/olLaunchKernelCooperative.cpp)
add_offload_unittest("memory"
memory/olMemAlloc.cpp
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 6f9961e2c6d58..9f91220b614aa 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -254,6 +254,43 @@ struct OffloadEventTest : OffloadQueueTest {
ol_event_handle_t Event = nullptr;
};
+struct LaunchKernelTestBase : OffloadQueueTest {
+ void SetUpProgram(const char *program) {
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
+ ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
+ ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+ DeviceBin->getBufferSize(), &Program));
+
+ LaunchArgs.Dimensions = 1;
+ LaunchArgs.GroupSize = {64, 1, 1};
+ LaunchArgs.NumGroups = {1, 1, 1};
+
+ LaunchArgs.DynSharedMemory = 0;
+ }
+
+ void TearDown() override {
+ if (Program) {
+ olDestroyProgram(Program);
+ }
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+ }
+
+ std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+ ol_program_handle_t Program = nullptr;
+ ol_kernel_launch_size_args_t LaunchArgs{};
+};
+
+struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
+ void SetUpKernel(const char *kernel) {
+ RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
+ ASSERT_SUCCESS(
+ olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel));
+ }
+
+ ol_symbol_handle_t Kernel = nullptr;
+};
+
// Devices might not be available for offload testing, so allow uninstantiated
// tests (as the device list will be empty). This means that all tests requiring
// a device will be silently skipped.
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 166b8dabff0d8..2f444e48330f8 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -10,43 +10,6 @@
#include <OffloadAPI.h>
#include <gtest/gtest.h>
-struct LaunchKernelTestBase : OffloadQueueTest {
- void SetUpProgram(const char *program) {
- RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
- ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
- ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
- ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
- DeviceBin->getBufferSize(), &Program));
-
- LaunchArgs.Dimensions = 1;
- LaunchArgs.GroupSize = {64, 1, 1};
- LaunchArgs.NumGroups = {1, 1, 1};
-
- LaunchArgs.DynSharedMemory = 0;
- }
-
- void TearDown() override {
- if (Program) {
- olDestroyProgram(Program);
- }
- RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
- }
-
- std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
- ol_program_handle_t Program = nullptr;
- ol_kernel_launch_size_args_t LaunchArgs{};
-};
-
-struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
- void SetUpKernel(const char *kernel) {
- RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
- ASSERT_SUCCESS(
- olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel));
- }
-
- ol_symbol_handle_t Kernel = nullptr;
-};
-
#define KERNEL_TEST(NAME, KERNEL) \
struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase { \
void SetUp() override { SetUpKernel(#KERNEL); } \
@@ -93,8 +56,8 @@ TEST_P(olLaunchKernelFooTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -116,7 +79,7 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
} Args{Mem};
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -131,7 +94,7 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
@@ -143,9 +106,8 @@ TEST_P(olLaunchKernelMultiArgsTest, Success) {
short C;
} Args{0, nullptr, 0};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
-
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
@@ -159,7 +121,7 @@ TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
} Args{Mem};
ASSERT_SUCCESS(olLaunchKernel(nullptr, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
uint32_t *Data = (uint32_t *)Mem;
for (uint32_t i = 0; i < 64; i++) {
@@ -182,8 +144,8 @@ TEST_P(olLaunchKernelLocalMemTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -205,8 +167,8 @@ TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -228,8 +190,8 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
@@ -249,10 +211,10 @@ TEST_P(olLaunchKernelGlobalTest, Success) {
} Args{Mem};
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *Data = (uint32_t *)Mem;
@@ -267,8 +229,9 @@ TEST_P(olLaunchKernelGlobalTest, InvalidNotAKernel) {
ol_symbol_handle_t Global = nullptr;
ASSERT_SUCCESS(
olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global));
- ASSERT_ERROR(OL_ERRC_SYMBOL_KIND,
- olLaunchKernel(Queue, Device, Global, nullptr, 0, &LaunchArgs));
+ ASSERT_ERROR(
+ OL_ERRC_SYMBOL_KIND,
+ olLaunchKernel(Queue, Device, Global, nullptr, 0, &LaunchArgs, NULL));
}
TEST_P(olLaunchKernelGlobalCtorTest, Success) {
@@ -279,8 +242,8 @@ TEST_P(olLaunchKernelGlobalCtorTest, Success) {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *Data = (uint32_t *)Mem;
@@ -296,6 +259,6 @@ TEST_P(olLaunchKernelGlobalDtorTest, Success) {
// find/implement a way, update this test. For now we just check that nothing
// crashes
ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs));
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
new file mode 100644
index 0000000000000..3abe3f916693f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernelCooperative.cpp
@@ -0,0 +1,120 @@
+//===------- Offload API tests - olLaunchKernel Cooperative ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+struct olLaunchKernelCooperativeTest : LaunchSingleKernelTestBase {
+ void SetUp() override {
+ SetUpKernel("noargs");
+
+ bool SupportsCooperative = false;
+ auto Result =
+ olGetDeviceInfo(Device, OL_DEVICE_INFO_COOPERATIVE_LAUNCH_SUPPORT,
+ sizeof(bool), &SupportsCooperative);
+
+ if (Result) {
+ if (Result->Code == OL_ERRC_UNIMPLEMENTED) {
+ GTEST_SKIP()
+ << "Device does not provide cooperative launch support information";
+ }
+ if (Result->Code != OL_ERRC_SUCCESS) {
+ GTEST_FAIL() << "olGetDeviceInfo failed with unexpected error: "
+ << Result->Code << ": " << Result->Details;
+ }
+ }
+
+ if (!SupportsCooperative) {
+ GTEST_SKIP() << "Device does not support cooperative kernel launch";
+ }
+ }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelCooperativeTest);
+
+TEST_P(olLaunchKernelCooperativeTest, GetMaxCooperativeGroupCount) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+ ASSERT_GT(MaxGroupCount, 0u);
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SuccessCooperative) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = MaxGroupCount;
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_SUCCESS(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SuccessNonCooperative) {
+ ASSERT_SUCCESS(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, TooManyGroups) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = MaxGroupCount * 2;
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_ANY_ERROR(
+ olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, SynchronousLaunch) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_SUCCESS(olGetKernelMaxCooperativeGroupCount(
+ Device, Kernel, &LaunchArgs, &MaxGroupCount));
+
+ LaunchArgs.NumGroups.x = std::min(MaxGroupCount, 2u);
+
+ bool IsCooperative = true;
+ ol_kernel_launch_prop_t Props[] = {
+ {OL_KERNEL_LAUNCH_PROP_TYPE_IS_COOPERATIVE, &IsCooperative},
+ OL_KERNEL_LAUNCH_PROP_END};
+
+ ASSERT_SUCCESS(
+ olLaunchKernel(nullptr, Device, Kernel, nullptr, 0, &LaunchArgs, Props));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullHandleKernel) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetKernelMaxCooperativeGroupCount(Device, nullptr, &LaunchArgs,
+ &MaxGroupCount));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullHandleDevice) {
+ uint32_t MaxGroupCount = 0;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetKernelMaxCooperativeGroupCount(nullptr, Kernel, &LaunchArgs,
+ &MaxGroupCount));
+}
+
+TEST_P(olLaunchKernelCooperativeTest, InvalidNullPointerGroupCountRet) {
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olGetKernelMaxCooperativeGroupCount(Device, Kernel, &LaunchArgs,
+ nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
index cc67d782ef403..d9745c87e3496 100644
--- a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
@@ -171,7 +171,7 @@ TEST_P(olMemcpyGlobalTest, SuccessWrite) {
olMemcpy(Queue, Addr, Device, SourceMem, Host, 64 * sizeof(uint32_t)));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, ReadKernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
uint32_t *DestData = (uint32_t *)DestMem;
@@ -188,8 +188,8 @@ TEST_P(olMemcpyGlobalTest, SuccessRead) {
LaunchArgs.GroupSize.x * sizeof(uint32_t),
&DestMem));
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, WriteKernel, nullptr, 0, &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, WriteKernel, nullptr, 0,
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olSyncQueue(Queue));
ASSERT_SUCCESS(
olMemcpy(Queue, DestMem, Host, Addr, Device, 64 * sizeof(uint32_t)));
diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
index aa86750f6adf9..e0b3d1c280ac2 100644
--- a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
+++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
@@ -77,8 +77,8 @@ TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) {
struct {
void *Mem;
} Args{Mem};
- ASSERT_SUCCESS(
- olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs, NULL));
std::this_thread::sleep_for(std::chrono::milliseconds(500));
for (uint32_t i = 0; i < 64; i++) {
diff --git a/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp b/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
index 9838562752cc4..9011f17eff5e9 100644
--- a/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
+++ b/offload/unittests/OffloadAPI/queue/olWaitEvents.cpp
@@ -52,7 +52,7 @@ TEST_P(olWaitEventsTest, Success) {
ASSERT_SUCCESS(olWaitEvents(Queues[I], &Events[I - 1], 1));
ASSERT_SUCCESS(olLaunchKernel(Queues[I], Device, Kernel, &Args,
- sizeof(Args), &LaunchArgs));
+ sizeof(Args), &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queues[I], &Events[I]));
}
@@ -86,7 +86,7 @@ TEST_P(olWaitEventsTest, SuccessSingleQueue) {
ASSERT_SUCCESS(olWaitEvents(Queue, &Events[I - 1], 1));
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
- &LaunchArgs));
+ &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queue, &Events[I]));
}
@@ -120,7 +120,7 @@ TEST_P(olWaitEventsTest, SuccessMultipleEvents) {
ASSERT_SUCCESS(olWaitEvents(Queues[I], Events, I));
ASSERT_SUCCESS(olLaunchKernel(Queues[I], Device, Kernel, &Args,
- sizeof(Args), &LaunchArgs));
+ sizeof(Args), &LaunchArgs, NULL));
ASSERT_SUCCESS(olCreateEvent(Queues[I], &Events[I]));
}
More information about the llvm-commits
mailing list