[Mlir-commits] [mlir] b4117fe - Fix CUDA runtime wrapper for GPU mem alloc/free to async

Mon Apr 11 21:04:15 PDT 2022

Author: Uday Bondhugula
Date: 2022-04-12T09:04:02+05:30
New Revision: b4117fede20b8c649320ad37364ae208baa0d0e7

URL: https://github.com/llvm/llvm-project/commit/b4117fede20b8c649320ad37364ae208baa0d0e7
DIFF: https://github.com/llvm/llvm-project/commit/b4117fede20b8c649320ad37364ae208baa0d0e7.diff

LOG: Fix CUDA runtime wrapper for GPU mem alloc/free to async

Switch the CUDA runtime wrappers for GPU mem alloc/free to async. The
semantics of the GPU dialect ops (gpu.alloc/dealloc) and of the wrappers
they are lowered to (gpu-to-llvm) were those of the async versions --
however, the wrappers were being incorrectly mapped to
cuMemAlloc/cuMemFree instead of cuMemAllocAsync/cuMemFreeAsync.

Reviewed By: csigg

Differential Revision: https://reviews.llvm.org/D123482

Added: 
    

Modified: 
    mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 44ed5b0cd2057..18b6c589cb91f 100644

--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -18,6 +18,12 @@
 
 #include "cuda.h"
 
+// We need to know the CUDA version to determine how to map some of the runtime
+// calls below.
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#endif
+
 #ifdef _WIN32
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
@@ -134,15 +140,28 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream stream) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
+#if CUDA_VERSION >= 11020
+  // Use the async version that was available since CUDA 11.2.
+  CUDA_REPORT_IF_ERROR(cuMemAllocAsync(&ptr, sizeBytes, stream));
+#else
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
+  (void)stream;
+#endif
   return reinterpret_cast<void *>(ptr);
 }
 
-extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
+extern "C" void mgpuMemFree(void *ptr, CUstream stream) {
+#if CUDA_VERSION >= 11020
+  // Use the async version that was available since CUDA 11.2.
+  CUDA_REPORT_IF_ERROR(
+      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), stream));
+#else
   CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
+  (void)stream;
+#endif
 }
 
 extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,