[llvm] [offload] Add properties parameter to olLaunchKernel (PR #184343)

Łukasz Plewa via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 5 08:09:35 PST 2026


================
@@ -1509,7 +1538,73 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
         },
         &GenericDevice.Plugin);
 
-  return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
+  return Plugin::check(Res, "error in cuLaunchKernelEx for '%s': %s",
+                       getName());
+}
+
+Expected<uint32_t> CUDAKernelTy::getMaxCooperativeGroupCount(
+    GenericDeviceTy &GenericDevice, uint32_t WorkDim,
+    const size_t *LocalWorkSize, size_t DynamicSharedMemorySize) const {
+  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+
+  uint32_t SupportsCooperative = 0;
+  if (auto Err = CUDADevice.getDeviceAttr(
+          CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, SupportsCooperative))
+    return Err;
+
+  if (!SupportsCooperative) {
+    return Plugin::error(ErrorCode::UNSUPPORTED,
+                         "Device does not support cooperative launch");
+  }
+
+  // Calculate total local work size
+  size_t LocalWorkSizeTotal = LocalWorkSize[0];
+  LocalWorkSizeTotal *= (WorkDim >= 2 ? LocalWorkSize[1] : 1);
+  LocalWorkSizeTotal *= (WorkDim == 3 ? LocalWorkSize[2] : 1);
+
+  // Query max active blocks per multiprocessor
+  int MaxNumActiveGroupsPerCU = 0;
+  CUresult Res = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+      &MaxNumActiveGroupsPerCU, Func, LocalWorkSizeTotal,
+      DynamicSharedMemorySize);
+  if (auto Err = Plugin::check(
+          Res, "error in cuOccupancyMaxActiveBlocksPerMultiprocessor: %s"))
+    return Err;
+
+  assert(MaxNumActiveGroupsPerCU >= 0);
+
+  // Handle the case where we can't have all SMs active with at least 1 group
+  // per SM. In that case, the device is still able to run 1 work-group, hence
+  // we will manually check if it is possible with the available HW resources.
+  if (MaxNumActiveGroupsPerCU == 0) {
+    // Get max threads per block for this kernel
+    int MaxThreads;
+    Res = cuFuncGetAttribute(&MaxThreads,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
+    if (auto Err = Plugin::check(Res, "error in cuFuncGetAttribute: %s"))
+      return Err;
+
+    // Get max shared memory per block
+    uint32_t MaxSharedMem;
+    if (auto Err = CUDADevice.getDeviceAttr(
+            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
----------------
lplewa wrote:

done

https://github.com/llvm/llvm-project/pull/184343


More information about the llvm-commits mailing list