[Mlir-commits] [mlir] [mlir][execution engine] Add runtime support to the `OffloadEmbeddingAttr` (PR #78116)

Fabian Mora llvmlistbot at llvm.org
Sun Jan 14 14:33:59 PST 2024


https://github.com/fabianmcg created https://github.com/llvm/llvm-project/pull/78116

This patch adds runtime support for the `OffloadEmbeddingAttr` GPU translation attribute.

A valid CUDA Toolkit installation is required to enable this path; if the toolkit is found at configure time, the path is enabled by default. This change also allows using the CUDA runtime's built-in context management instead of managing the driver context directly.

This patch is 2/4 in the series introducing the `OffloadEmbeddingAttr` GPU translation attribute.
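
To give a sense of how the new entry point is meant to be used, here is a minimal, hypothetical host-side sketch. The extern "C" prototypes mirror the wrappers touched by this patch (stream handles are shown as void * for brevity), while `myKernelStub`, the argument packing, and the launch geometry are invented for illustration:

// Hypothetical driver for the runtime-API launch path. The prototypes below
// mirror the mgpu* wrappers; myKernelStub and the argument layout are
// placeholders, not symbols produced by this patch.
#include <cstddef>
#include <cstdint>

extern "C" void *mgpuStreamCreate();
extern "C" void mgpuStreamSynchronize(void *stream);
extern "C" void mgpuStreamDestroy(void *stream);
extern "C" void mgpuLaunchKernelRT(void *function, intptr_t gridX,
                                   intptr_t gridY, intptr_t gridZ,
                                   intptr_t blockX, intptr_t blockY,
                                   intptr_t blockZ, int32_t smem, void *stream,
                                   void **params, void **extra,
                                   size_t paramsCount);

// Host-side stub whose address cudaLaunchKernel resolves to the embedded
// kernel; purely illustrative here.
extern "C" void myKernelStub();

int main() {
  void *stream = mgpuStreamCreate();
  float *devBuffer = nullptr; // device allocation elided for brevity
  int32_t n = 1024;
  void *params[] = {&devBuffer, &n}; // cudaLaunchKernel-style argument array
  // Launch a 1-D grid of 4 blocks x 256 threads, no dynamic shared memory.
  mgpuLaunchKernelRT(reinterpret_cast<void *>(&myKernelStub),
                     /*gridX=*/4, /*gridY=*/1, /*gridZ=*/1,
                     /*blockX=*/256, /*blockY=*/1, /*blockZ=*/1,
                     /*smem=*/0, stream, params, /*extra=*/nullptr,
                     /*paramsCount=*/2);
  mgpuStreamSynchronize(stream);
  mgpuStreamDestroy(stream);
  return 0;
}

In practice these calls are emitted by the GPU-to-LLVM lowering rather than written by hand; the sketch only shows the shape of the new interface.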

From 8df2a31fa0c31633a361028892cf8a95a75a8145 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Sun, 14 Jan 2024 22:17:34 +0000
Subject: [PATCH] [mlir][execution engine] Add runtime support to the
 `OffloadEmbeddingAttr`

This patch adds runtime support for the `OffloadEmbeddingAttr` GPU translation
attribute.

A valid CUDA Toolkit installation is required to enable this path; if the
toolkit is found at configure time, the path is enabled by default. This change
also allows using the CUDA runtime's built-in context management.

This patch is 2/4 in the series introducing the `OffloadEmbeddingAttr` GPU
translation attribute.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt       | 13 +++++
 .../ExecutionEngine/CudaRuntimeWrappers.cpp   | 48 ++++++++++++++++++-
 .../ExecutionEngine/RocmRuntimeWrappers.cpp   | 11 +++++
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 2f391b7698cbb0..e4c7c353f5e13a 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -217,6 +217,19 @@ if(LLVM_ENABLE_PIC)
       PRIVATE
       ${CUDA_RUNTIME_LIBRARY}
     )
+    find_package(CUDAToolkit)
+    if (CUDAToolkit_FOUND)
+      target_link_libraries(mlir_cuda_runtime PRIVATE CUDA::cudart)
+      target_compile_definitions(mlir_cuda_runtime
+        PRIVATE
+        MLIR_ENABLE_CUDA_RUNNER_RT=1
+      )
+    else()
+      target_compile_definitions(mlir_cuda_runtime
+        PRIVATE
+        MLIR_ENABLE_CUDA_RUNNER_RT=0
+      )
+    endif()
 
     if(MLIR_ENABLE_CUDA_CUSPARSE)
       # Find the libcusparse.so library if CUSPARSE build is requested.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b9a3429e37b885..6c8983825283de 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -20,6 +20,10 @@
 #include "cuda_bf16.h"
 #include "cuda_fp16.h"
 
+#if MLIR_ENABLE_CUDA_RUNNER_RT == 1
+#include "cuda_runtime.h"
+#endif
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSE
 #include "cusparse.h"
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
@@ -44,6 +48,16 @@
     fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
   }(expr)
 
+#define CUDART_REPORT_IF_ERROR(expr)                                           \
+  [](cudaError_t result) {                                                     \
+    if (!result)                                                               \
+      return;                                                                  \
+    const char *name = cudaGetErrorName(result);                               \
+    if (!name)                                                                 \
+      name = "<unknown>";                                                      \
+    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
+  }(expr)
+
 #define CUSPARSE_REPORT_IF_ERROR(expr)                                         \
   {                                                                            \
     cusparseStatus_t status = (expr);                                          \
@@ -88,6 +102,14 @@ class ScopedContext {
   ScopedContext() {
     // Static reference to CUDA primary context for device ordinal
     // defaultDevice.
+#if MLIR_ENABLE_CUDA_RUNNER_RT == 1
+    static int rt_init = []() {
+      CUDART_REPORT_IF_ERROR(cudaInitDevice(defaultDevice, 0, 0));
+      return 0;
+    }();
+    (void)rt_init;
+    CUDART_REPORT_IF_ERROR(cudaSetDevice(defaultDevice));
+#else
     static CUcontext context = [] {
       CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
       CUcontext ctx;
@@ -98,9 +120,14 @@ class ScopedContext {
     }();
 
     CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
+#endif
   }
 
-  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
+  ~ScopedContext() {
+#if MLIR_ENABLE_CUDA_RUNNER_RT == 0
+    CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr));
+#endif
+  }
 };
 
 #ifdef MLIR_ENABLE_CUDA_CUSPARSE
@@ -194,6 +221,25 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                                       extra));
 }
 
+// The wrapper uses intptr_t instead of CUDA's unsigned int to match
+// the type of MLIR's index type. This avoids the need for casts in the
+// generated MLIR code.
+#if MLIR_ENABLE_CUDA_RUNNER_RT == 1
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuLaunchKernelRT(void *function, intptr_t gridX, intptr_t gridY,
+                   intptr_t gridZ, intptr_t blockX, intptr_t blockY,
+                   intptr_t blockZ, int32_t smem, CUstream stream,
+                   void **params, void **extra, size_t /*paramsCount*/) {
+  debug_print("Launching kernel, grid=%ld,%ld,%ld, "
+              "threads: %ld, %ld, %ld, "
+              "smem: %dkb\n",
+              gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
+  CUDART_REPORT_IF_ERROR(cudaLaunchKernel(function, dim3(gridX, gridY, gridZ),
+                                          dim3(blockX, blockY, blockZ), params,
+                                          smem, stream));
+}
+#endif
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
   ScopedContext scopedContext;
   CUstream stream = nullptr;
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 11cf6d7b077c0f..3720aae9cb0c7c 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -68,6 +68,17 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
                                             stream, params, extra));
 }
 
+extern "C" void mgpuLaunchKernelRT(void *function, intptr_t gridX,
+                                   intptr_t gridY, intptr_t gridZ,
+                                   intptr_t blockX, intptr_t blockY,
+                                   intptr_t blockZ, int32_t smem,
+                                   hipStream_t stream, void **params,
+                                   void **extra, size_t /*paramsCount*/) {
+  HIP_REPORT_IF_ERROR(hipLaunchKernel(function, dim3(gridX, gridY, gridZ),
+                                      dim3(blockX, blockY, blockZ), params,
+                                      smem, stream));
+}
+
 extern "C" hipStream_t mgpuStreamCreate() {
   hipStream_t stream = nullptr;
   HIP_REPORT_IF_ERROR(hipStreamCreate(&stream));


