[Mlir-commits] [mlir] 5388149 - [mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute
Guray Ozen
llvmlistbot at llvm.org
Wed Aug 2 05:19:05 PDT 2023
Author: Guray Ozen
Date: 2023-08-02T14:18:59+02:00
New Revision: 53881490c2edae7cdaa186f0c371e4ef27e0ed5e
URL: https://github.com/llvm/llvm-project/commit/53881490c2edae7cdaa186f0c371e4ef27e0ed5e
DIFF: https://github.com/llvm/llvm-project/commit/53881490c2edae7cdaa186f0c371e4ef27e0ed5e.diff
LOG: [mlir][cuda runtime] Set Max Dynamic Shared Memory Attribute
This work addresses an issue with larger shared memory usage in the MLIR CUDA runtime. When a kernel requests more than 48KB of dynamic shared memory, the CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute of the CUDA kernel must be set accordingly. This patch sets the attribute as required and additionally adds some debug prints for better visibility and troubleshooting.
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D156874
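For reference, a minimal self-contained sketch of the pattern this patch applies in the runtime wrapper: query the device's opt-in per-block shared-memory limit, raise the kernel's dynamic shared-memory cap, then launch. This is not part of the commit; the function name, launch dimensions, and omitted error checking are illustrative assumptions.

    #include <cstdio>
    #include <cuda.h>

    // Hypothetical helper showing the attribute dance needed for launches that
    // request more than the default 48KB of dynamic shared memory.
    void launchWithLargeSmem(CUfunction function, CUstream stream, void **params,
                             int smemBytes) {
      CUdevice device;
      cuDeviceGet(&device, /*ordinal=*/0);

      // Maximum dynamic shared memory a block may opt into on this device.
      int maxOptInSmem = 0;
      cuDeviceGetAttribute(&maxOptInSmem,
                           CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
                           device);
      if (smemBytes > maxOptInSmem)
        fprintf(stderr, "Requested %d bytes exceeds device limit of %d bytes\n",
                smemBytes, maxOptInSmem);

      // Without this attribute, launches asking for more than 48KB of dynamic
      // shared memory fail with CUDA_ERROR_INVALID_VALUE.
      cuFuncSetAttribute(function,
                         CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                         smemBytes);

      cuLaunchKernel(function, /*grid=*/1, 1, 1, /*block=*/128, 1, 1, smemBytes,
                     stream, params, /*extra=*/nullptr);
    }

The 48KB figure is the static per-block limit that applies when the opt-in attribute is not set, which is why the wrapper below sets the attribute before every launch.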
Added:
Modified:
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
Removed:
################################################################################
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index ea3e19f141285e..664293c53570d2 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -73,6 +73,13 @@ bool isDebugEnabled() {
__func__, __VA_ARGS__); \
} while (0)
+// Returns default CUdevice
+CUdevice getDefaultCuDevice() {
+ CUdevice device;
+ CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+ return device;
+}
+
// Make the primary context of the current default device current for the
// duration
// of the instance and restore the previous context on destruction.
@@ -83,11 +90,10 @@ class ScopedContext {
// defaultDevice.
static CUcontext context = [] {
CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
- CUdevice device;
- CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
CUcontext ctx;
// Note: this does not affect the current context.
- CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
+ CUDA_REPORT_IF_ERROR(
+ cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
return ctx;
}();
@@ -140,6 +146,24 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
intptr_t blockZ, int32_t smem, CUstream stream, void **params,
void **extra) {
ScopedContext scopedContext;
+ int32_t maxShmem = 0;
+ CUdevice device = getDefaultCuDevice();
+ CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+ CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+ &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+ device));
+ if (maxShmem < smem) {
+ fprintf(stderr,
+ "Requested shared memory (%dkb) is larger than maximum allowed "
+ "shared memory (%dkb) for this device\n",
+ smem, maxShmem);
+ }
+ CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+ function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+ debug_print("Launching kernel, grid=%ld,%ld,%ld, "
+ "threads: %ld, %ld, %ld, "
+ "smem: %dkb\n",
+ gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
blockY, blockZ, smem, stream, params,
extra));