[flang-commits] [flang] [flang][cuda] Add function to allocate and deallocate device module variable (PR #109213)

Wed Sep 18 15:39:54 PDT 2024

https://github.com/clementval created https://github.com/llvm/llvm-project/pull/109213

This patch adds new runtime entry points that perform the simple allocation/deallocation of module allocatable variable with cuda attributes.
When the allocation is initiated on the host, the descriptor on the device is synchronized. Both descriptors point to the same data on the device. 

This is the first PR of a stack.

>From 9fb92e2c8e36fbe5242b89a871637b11eba31ce8 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 17 Sep 2024 14:16:36 -0700
Subject: [PATCH] [flang][cuda] Add function to allocate and deallocate device
 module variable

---
 .../include/flang/Runtime/CUDA/allocatable.h  | 34 +++++++++
 flang/include/flang/Runtime/CUDA/allocator.h  | 11 ---
 flang/include/flang/Runtime/CUDA/common.h     | 30 ++++++++
 flang/include/flang/Runtime/CUDA/descriptor.h | 13 +++-
 flang/include/flang/Runtime/CUDA/memory.h     |  4 --
 .../Optimizer/Transforms/CufOpConversion.cpp  |  1 +
 flang/runtime/CUDA/CMakeLists.txt             |  4 ++
 flang/runtime/CUDA/allocatable.cpp            | 69 +++++++++++++++++++
 flang/runtime/CUDA/allocator.cpp              |  1 +
 flang/runtime/CUDA/descriptor.cpp             | 22 ++++++
 flang/unittests/Runtime/CUDA/Allocatable.cpp  | 60 ++++++++++++++++
 flang/unittests/Runtime/CUDA/CMakeLists.txt   |  8 +++
 12 files changed, 240 insertions(+), 17 deletions(-)
 create mode 100644 flang/include/flang/Runtime/CUDA/allocatable.h
 create mode 100644 flang/include/flang/Runtime/CUDA/common.h
 create mode 100644 flang/runtime/CUDA/allocatable.cpp
 create mode 100644 flang/unittests/Runtime/CUDA/Allocatable.cpp

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
new file mode 100644
index 00000000000000..e986ad910a3f3a
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -0,0 +1,34 @@
+//===-- include/flang/Runtime/CUDA/allocatable.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+#define FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+
+/// Perform allocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+/// Perform deallocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
+#endif // FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 4527c9f18fa054..06bda81c6f75ad 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -12,17 +12,6 @@
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
-#define CUDA_REPORT_IF_ERROR(expr) \
-  [](cudaError_t err) { \
-    if (err == cudaSuccess) \
-      return; \
-    const char *name = cudaGetErrorName(err); \
-    if (!name) \
-      name = "<unknown>"; \
-    Terminator terminator{__FILE__, __LINE__}; \
-    terminator.Crash("'%s' failed with '%s'", #expr, name); \
-  }(expr)
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {
diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h
new file mode 100644
index 00000000000000..cb8681da161f0d
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/common.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/common.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_COMMON_H_
+#define FORTRAN_RUNTIME_CUDA_COMMON_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+static constexpr unsigned kHostToDevice = 0;
+static constexpr unsigned kDeviceToHost = 1;
+static constexpr unsigned kDeviceToDevice = 2;
+
+#define CUDA_REPORT_IF_ERROR(expr) \
+  [](cudaError_t err) { \
+    if (err == cudaSuccess) \
+      return; \
+    const char *name = cudaGetErrorName(err); \
+    if (!name) \
+      name = "<unknown>"; \
+    Terminator terminator{__FILE__, __LINE__}; \
+    terminator.Crash("'%s' failed with '%s'", #expr, name); \
+  }(expr)
+
+#endif // FORTRAN_RUNTIME_CUDA_COMMON_H_
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index d593989420420f..7b870c74cd7adb 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -17,14 +17,23 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
-// Allocate a descriptor in managed.
+/// Allocate a descriptor in managed.
 Descriptor *RTDECL(CUFAllocDesciptor)(
     std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
 
-// Deallocate a descriptor allocated in managed or unified memory.
+/// Deallocate a descriptor allocated in managed or unified memory.
 void RTDECL(CUFFreeDesciptor)(
     Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Retrieve the device descriptor's pointer from the host one.
+Descriptor *RTDECL(CUFGetDeviceDescAddress)(
+    Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
+
+/// Sync the \p src descriptor to the \p dst descriptor.
+void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
+
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h
index 8fd51129e81fe0..33947248dc4831 100644
--- a/flang/include/flang/Runtime/CUDA/memory.h
+++ b/flang/include/flang/Runtime/CUDA/memory.h
@@ -13,10 +13,6 @@
 #include "flang/Runtime/entry-names.h"
 #include <cstddef>
 
-static constexpr unsigned kHostToDevice = 0;
-static constexpr unsigned kDeviceToHost = 1;
-static constexpr unsigned kDeviceToDevice = 2;
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index 03a1eb74343b43..2dc37f4df3aeec 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -14,6 +14,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/allocatable.h"
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 490bb369b572f6..803ff01b945dc4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -15,8 +15,12 @@ set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR})
 
 add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
+  allocatable.cpp
   descriptor.cpp
   memory.cpp
+
+  LINK_COMPONENTS
+  Support
 )
 
 if (BUILD_SHARED_LIBS)
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
new file mode 100644
index 00000000000000..3eafadb7842274
--- /dev/null
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -0,0 +1,69 @@
+//===-- runtime/CUDA/allocatable.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/allocatable.h"
+#include "../stat.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocatable.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "cuda_runtime.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the allocation is done
+  // from the host.
+  if (stat == StatOk) {
+    Descriptor *deviceAddr{
+        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableDeallocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the deallocation is done
+  // from the host.
+  if (stat == StatOk) {
+    Descriptor *deviceAddr{
+        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+RT_EXT_API_GROUP_END
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index d4a473d58e86cd..85b3daf65a8ba4 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -13,6 +13,7 @@
 #include "../type-info.h"
 #include "flang/Common/Fortran.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 1031b1e601b646..3eec0135b3d883 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -7,7 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/descriptor.h"
+#include "../terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
 extern "C" {
@@ -23,6 +27,24 @@ void RTDEF(CUFFreeDesciptor)(
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }
 
+Descriptor *RTDEF(CUFGetDeviceDescAddress)(
+    Descriptor &desc, const char *sourceFile, int sourceLine) {
+  Terminator terminator{sourceFile, sourceLine};
+  void *p;
+  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, &desc));
+  if (!p) {
+    terminator.Crash("Could not retrieve symbol's address");
+  }
+  return (Descriptor *)p;
+}
+
+void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile, int sourceLine) {
+  std::size_t count{src->SizeInBytes()};
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(
+      (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp
new file mode 100644
index 00000000000000..0f7eb27789316c
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp
@@ -0,0 +1,60 @@
+//===-- flang/unittests/Runtime/Allocatable.cpp ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/allocatable.h"
+#include "gtest/gtest.h"
+#include "../../../runtime/terminator.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocator-registry.h"
+
+#include "cuda_runtime.h"
+
+using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuda;
+
+static OwningPtr<Descriptor> createAllocatable(
+    Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+  return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+      CFI_attribute_allocatable);
+}
+
+TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  // Emulate a device descriptor for the purpose of unit testing part of the
+  // code.
+  Descriptor *device_desc;
+  CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}
diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
index ed0caece3d15db..30fb8c220233c0 100644
--- a/flang/unittests/Runtime/CUDA/CMakeLists.txt
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -1,11 +1,19 @@
 if (FLANG_CUF_RUNTIME)
 
 add_flang_unittest(FlangCufRuntimeTests
+  Allocatable.cpp
   AllocatorCUF.cpp
 )
 
+if (BUILD_SHARED_LIBS)
+  set(CUDA_RT_TARGET CUDA::cudart)
+else()
+  set(CUDA_RT_TARGET CUDA::cudart_static)
+endif()
+
 target_link_libraries(FlangCufRuntimeTests
   PRIVATE
+  ${CUDA_RT_TARGET}
   CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}
   FortranRuntime
 )