[flang-commits] [flang] cdf447b - [flang][cuda] Add function to allocate and deallocate device module variable (#109213)

Wed Sep 18 20:22:09 PDT 2024

Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-09-18T20:22:06-07:00
New Revision: cdf447baa50e837961384fab1e4d087da30b6f3d

URL: https://github.com/llvm/llvm-project/commit/cdf447baa50e837961384fab1e4d087da30b6f3d
DIFF: https://github.com/llvm/llvm-project/commit/cdf447baa50e837961384fab1e4d087da30b6f3d.diff

LOG: [flang][cuda] Add function to allocate and deallocate device module variable (#109213)

This patch adds new runtime entry points that perform the simple
allocation/deallocation of module allocatable variable with cuda
attributes.
When the allocation is initiated on the host, the descriptor on the
device is synchronized. Both descriptors point to the same data on the
device.

This is the first PR of a stack.

Added: 
    flang/include/flang/Runtime/CUDA/allocatable.h
    flang/include/flang/Runtime/CUDA/common.h
    flang/runtime/CUDA/allocatable.cpp
    flang/unittests/Runtime/CUDA/Allocatable.cpp

Modified: 
    flang/include/flang/Runtime/CUDA/allocator.h
    flang/include/flang/Runtime/CUDA/descriptor.h
    flang/include/flang/Runtime/CUDA/memory.h
    flang/lib/Optimizer/Transforms/CufOpConversion.cpp
    flang/runtime/CUDA/CMakeLists.txt
    flang/runtime/CUDA/allocator.cpp
    flang/runtime/CUDA/descriptor.cpp
    flang/unittests/Runtime/CUDA/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
new file mode 100644
index 00000000000000..e986ad910a3f3a

--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -0,0 +1,34 @@
+//===-- include/flang/Runtime/CUDA/allocatable.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+#define FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+
+/// Perform allocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+/// Perform deallocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
+#endif // FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_

diff  --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 4527c9f18fa054..06bda81c6f75ad 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -12,17 +12,6 @@
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
-#define CUDA_REPORT_IF_ERROR(expr) \
-  [](cudaError_t err) { \
-    if (err == cudaSuccess) \
-      return; \
-    const char *name = cudaGetErrorName(err); \
-    if (!name) \
-      name = "<unknown>"; \
-    Terminator terminator{__FILE__, __LINE__}; \
-    terminator.Crash("'%s' failed with '%s'", #expr, name); \
-  }(expr)
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {

diff  --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h
new file mode 100644
index 00000000000000..cb8681da161f0d
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/common.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/common.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_COMMON_H_
+#define FORTRAN_RUNTIME_CUDA_COMMON_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+static constexpr unsigned kHostToDevice = 0;
+static constexpr unsigned kDeviceToHost = 1;
+static constexpr unsigned kDeviceToDevice = 2;
+
+#define CUDA_REPORT_IF_ERROR(expr) \
+  [](cudaError_t err) { \
+    if (err == cudaSuccess) \
+      return; \
+    const char *name = cudaGetErrorName(err); \
+    if (!name) \
+      name = "<unknown>"; \
+    Terminator terminator{__FILE__, __LINE__}; \
+    terminator.Crash("'%s' failed with '%s'", #expr, name); \
+  }(expr)
+
+#endif // FORTRAN_RUNTIME_CUDA_COMMON_H_

diff  --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index d593989420420f..93791012fdcc73 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -17,14 +17,23 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
-// Allocate a descriptor in managed.
+/// Allocate a descriptor in managed.
 Descriptor *RTDECL(CUFAllocDesciptor)(
     std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
 
-// Deallocate a descriptor allocated in managed or unified memory.
+/// Deallocate a descriptor allocated in managed or unified memory.
 void RTDECL(CUFFreeDesciptor)(
     Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Retrieve the device pointer from the host one.
+void *RTDECL(CUFGetDeviceAddress)(
+    void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
+
+/// Sync the \p src descriptor to the \p dst descriptor.
+void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
+
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_

diff  --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h
index 8fd51129e81fe0..33947248dc4831 100644
--- a/flang/include/flang/Runtime/CUDA/memory.h
+++ b/flang/include/flang/Runtime/CUDA/memory.h
@@ -13,10 +13,6 @@
 #include "flang/Runtime/entry-names.h"
 #include <cstddef>
 
-static constexpr unsigned kHostToDevice = 0;
-static constexpr unsigned kDeviceToHost = 1;
-static constexpr unsigned kDeviceToDevice = 2;
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {

diff  --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index 03a1eb74343b43..2dc37f4df3aeec 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -14,6 +14,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/allocatable.h"

diff  --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 490bb369b572f6..803ff01b945dc4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -15,8 +15,12 @@ set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR})
 
 add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
+  allocatable.cpp
   descriptor.cpp
   memory.cpp
+
+  LINK_COMPONENTS
+  Support
 )
 
 if (BUILD_SHARED_LIBS)

diff  --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
new file mode 100644
index 00000000000000..cc0c647c6c9529
--- /dev/null
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -0,0 +1,71 @@
+//===-- runtime/CUDA/allocatable.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/allocatable.h"
+#include "../stat.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocatable.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "cuda_runtime.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the allocation is done
+  // from the host.
+  if (stat == StatOk) {
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableDeallocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the deallocation is done
+  // from the host.
+  if (stat == StatOk) {
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+RT_EXT_API_GROUP_END
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda

diff  --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index d4a473d58e86cd..85b3daf65a8ba4 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -13,6 +13,7 @@
 #include "../type-info.h"
 #include "flang/Common/Fortran.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda_runtime.h"

diff  --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 1031b1e601b646..7ce1429cd94d4a 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -7,7 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/descriptor.h"
+#include "../terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
 extern "C" {
@@ -23,6 +27,24 @@ void RTDEF(CUFFreeDesciptor)(
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }
 
+void *RTDEF(CUFGetDeviceAddress)(
+    void *hostPtr, const char *sourceFile, int sourceLine) {
+  Terminator terminator{sourceFile, sourceLine};
+  void *p;
+  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, hostPtr));
+  if (!p) {
+    terminator.Crash("Could not retrieve symbol's address");
+  }
+  return p;
+}
+
+void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile, int sourceLine) {
+  std::size_t count{src->SizeInBytes()};
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(
+      (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda

diff  --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp
new file mode 100644
index 00000000000000..0f7eb27789316c
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp
@@ -0,0 +1,60 @@
+//===-- flang/unittests/Runtime/Allocatable.cpp ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/allocatable.h"
+#include "gtest/gtest.h"
+#include "../../../runtime/terminator.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocator-registry.h"
+
+#include "cuda_runtime.h"
+
+using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuda;
+
+static OwningPtr<Descriptor> createAllocatable(
+    Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+  return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+      CFI_attribute_allocatable);
+}
+
+TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  // Emulate a device descriptor for the purpose of unit testing part of the
+  // code.
+  Descriptor *device_desc;
+  CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}

diff  --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
index ed0caece3d15db..30fb8c220233c0 100644
--- a/flang/unittests/Runtime/CUDA/CMakeLists.txt
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -1,11 +1,19 @@
 if (FLANG_CUF_RUNTIME)
 
 add_flang_unittest(FlangCufRuntimeTests
+  Allocatable.cpp
   AllocatorCUF.cpp
 )
 
+if (BUILD_SHARED_LIBS)
+  set(CUDA_RT_TARGET CUDA::cudart)
+else()
+  set(CUDA_RT_TARGET CUDA::cudart_static)
+endif()
+
 target_link_libraries(FlangCufRuntimeTests
   PRIVATE
+  ${CUDA_RT_TARGET}
   CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}
   FortranRuntime
 )