[flang-commits] [flang] [flang][cuda] Add function to allocate and deallocate device module variable (PR #109213)

Wed Sep 18 17:40:50 PDT 2024

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/109213

>From 9fb92e2c8e36fbe5242b89a871637b11eba31ce8 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 17 Sep 2024 14:16:36 -0700
Subject: [PATCH 1/5] [flang][cuda] Add function to allocate and deallocate
 device module variable

---
 .../include/flang/Runtime/CUDA/allocatable.h  | 34 +++++++++
 flang/include/flang/Runtime/CUDA/allocator.h  | 11 ---
 flang/include/flang/Runtime/CUDA/common.h     | 30 ++++++++
 flang/include/flang/Runtime/CUDA/descriptor.h | 13 +++-
 flang/include/flang/Runtime/CUDA/memory.h     |  4 --
 .../Optimizer/Transforms/CufOpConversion.cpp  |  1 +
 flang/runtime/CUDA/CMakeLists.txt             |  4 ++
 flang/runtime/CUDA/allocatable.cpp            | 69 +++++++++++++++++++
 flang/runtime/CUDA/allocator.cpp              |  1 +
 flang/runtime/CUDA/descriptor.cpp             | 22 ++++++
 flang/unittests/Runtime/CUDA/Allocatable.cpp  | 60 ++++++++++++++++
 flang/unittests/Runtime/CUDA/CMakeLists.txt   |  8 +++
 12 files changed, 240 insertions(+), 17 deletions(-)
 create mode 100644 flang/include/flang/Runtime/CUDA/allocatable.h
 create mode 100644 flang/include/flang/Runtime/CUDA/common.h
 create mode 100644 flang/runtime/CUDA/allocatable.cpp
 create mode 100644 flang/unittests/Runtime/CUDA/Allocatable.cpp

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
new file mode 100644
index 00000000000000..e986ad910a3f3a
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -0,0 +1,34 @@
+//===-- include/flang/Runtime/CUDA/allocatable.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+#define FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+
+/// Perform allocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+/// Perform deallocation of the descriptor with synchronization of it when
+/// necessary.
+int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
+#endif // FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 4527c9f18fa054..06bda81c6f75ad 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -12,17 +12,6 @@
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
-#define CUDA_REPORT_IF_ERROR(expr) \
-  [](cudaError_t err) { \
-    if (err == cudaSuccess) \
-      return; \
-    const char *name = cudaGetErrorName(err); \
-    if (!name) \
-      name = "<unknown>"; \
-    Terminator terminator{__FILE__, __LINE__}; \
-    terminator.Crash("'%s' failed with '%s'", #expr, name); \
-  }(expr)
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {
diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h
new file mode 100644
index 00000000000000..cb8681da161f0d
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/common.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/common.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_COMMON_H_
+#define FORTRAN_RUNTIME_CUDA_COMMON_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+
+static constexpr unsigned kHostToDevice = 0;
+static constexpr unsigned kDeviceToHost = 1;
+static constexpr unsigned kDeviceToDevice = 2;
+
+#define CUDA_REPORT_IF_ERROR(expr) \
+  [](cudaError_t err) { \
+    if (err == cudaSuccess) \
+      return; \
+    const char *name = cudaGetErrorName(err); \
+    if (!name) \
+      name = "<unknown>"; \
+    Terminator terminator{__FILE__, __LINE__}; \
+    terminator.Crash("'%s' failed with '%s'", #expr, name); \
+  }(expr)
+
+#endif // FORTRAN_RUNTIME_CUDA_COMMON_H_
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index d593989420420f..7b870c74cd7adb 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -17,14 +17,23 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
-// Allocate a descriptor in managed.
+/// Allocate a descriptor in managed.
 Descriptor *RTDECL(CUFAllocDesciptor)(
     std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
 
-// Deallocate a descriptor allocated in managed or unified memory.
+/// Deallocate a descriptor allocated in managed or unified memory.
 void RTDECL(CUFFreeDesciptor)(
     Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Retrieve the device descriptor's pointer from the host one.
+Descriptor *RTDECL(CUFGetDeviceDescAddress)(
+    Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
+
+/// Sync the \p src descriptor to the \p dst descriptor.
+void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
+
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h
index 8fd51129e81fe0..33947248dc4831 100644
--- a/flang/include/flang/Runtime/CUDA/memory.h
+++ b/flang/include/flang/Runtime/CUDA/memory.h
@@ -13,10 +13,6 @@
 #include "flang/Runtime/entry-names.h"
 #include <cstddef>
 
-static constexpr unsigned kHostToDevice = 0;
-static constexpr unsigned kDeviceToHost = 1;
-static constexpr unsigned kDeviceToDevice = 2;
-
 namespace Fortran::runtime::cuda {
 
 extern "C" {
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index 03a1eb74343b43..2dc37f4df3aeec 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -14,6 +14,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/allocatable.h"
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 490bb369b572f6..803ff01b945dc4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -15,8 +15,12 @@ set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR})
 
 add_flang_library(${CUFRT_LIBNAME}
   allocator.cpp
+  allocatable.cpp
   descriptor.cpp
   memory.cpp
+
+  LINK_COMPONENTS
+  Support
 )
 
 if (BUILD_SHARED_LIBS)
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
new file mode 100644
index 00000000000000..3eafadb7842274
--- /dev/null
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -0,0 +1,69 @@
+//===-- runtime/CUDA/allocatable.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/allocatable.h"
+#include "../stat.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocatable.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "cuda_runtime.h"
+
+namespace Fortran::runtime::cuda {
+
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the allocation is done
+  // from the host.
+  if (stat == StatOk) {
+    Descriptor *deviceAddr{
+        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableDeallocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+#ifndef RT_DEVICE_COMPILATION
+  // Descriptor synchronization is only done when the deallocation is done
+  // from the host.
+  if (stat == StatOk) {
+    Descriptor *deviceAddr{
+        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+  }
+#endif
+  return stat;
+}
+
+RT_EXT_API_GROUP_END
+
+} // extern "C"
+
+} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index d4a473d58e86cd..85b3daf65a8ba4 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -13,6 +13,7 @@
 #include "../type-info.h"
 #include "flang/Common/Fortran.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
+#include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 1031b1e601b646..3eec0135b3d883 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -7,7 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/descriptor.h"
+#include "../terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
 extern "C" {
@@ -23,6 +27,24 @@ void RTDEF(CUFFreeDesciptor)(
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }
 
+Descriptor *RTDEF(CUFGetDeviceDescAddress)(
+    Descriptor &desc, const char *sourceFile, int sourceLine) {
+  Terminator terminator{sourceFile, sourceLine};
+  void *p;
+  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, &desc));
+  if (!p) {
+    terminator.Crash("Could not retrieve symbol's address");
+  }
+  return (Descriptor *)p;
+}
+
+void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
+    const char *sourceFile, int sourceLine) {
+  std::size_t count{src->SizeInBytes()};
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(
+      (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp
new file mode 100644
index 00000000000000..0f7eb27789316c
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp
@@ -0,0 +1,60 @@
+//===-- flang/unittests/Runtime/Allocatable.cpp ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/allocatable.h"
+#include "gtest/gtest.h"
+#include "../../../runtime/terminator.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocator-registry.h"
+
+#include "cuda_runtime.h"
+
+using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuda;
+
+static OwningPtr<Descriptor> createAllocatable(
+    Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+  return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+      CFI_attribute_allocatable);
+}
+
+TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  // Emulate a device descriptor for the purpose of unit testing part of the
+  // code.
+  Descriptor *device_desc;
+  CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+
+  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
+  cudaDeviceSynchronize();
+
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}
diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
index ed0caece3d15db..30fb8c220233c0 100644
--- a/flang/unittests/Runtime/CUDA/CMakeLists.txt
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -1,11 +1,19 @@
 if (FLANG_CUF_RUNTIME)
 
 add_flang_unittest(FlangCufRuntimeTests
+  Allocatable.cpp
   AllocatorCUF.cpp
 )
 
+if (BUILD_SHARED_LIBS)
+  set(CUDA_RT_TARGET CUDA::cudart)
+else()
+  set(CUDA_RT_TARGET CUDA::cudart_static)
+endif()
+
 target_link_libraries(FlangCufRuntimeTests
   PRIVATE
+  ${CUDA_RT_TARGET}
   CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}
   FortranRuntime
 )

>From 3547be80e2f66b7f8ff89ca99bab40873ff9f513 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 18 Sep 2024 17:21:16 -0700
Subject: [PATCH 2/5] Make GetDeviceAddress more generic

---
 flang/include/flang/Runtime/CUDA/descriptor.h |  4 ++--
 flang/runtime/CUDA/allocatable.cpp            | 14 ++++++++------
 flang/runtime/CUDA/descriptor.cpp             |  8 ++++----
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 7b870c74cd7adb..89e9dd2eeb5688 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -26,8 +26,8 @@ void RTDECL(CUFFreeDesciptor)(
     Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Retrieve the device descriptor's pointer from the host one.
-Descriptor *RTDECL(CUFGetDeviceDescAddress)(
-    Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
+void *RTDECL(CUFGetDeviceAddress)(
+    void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Sync the \p src descriptor to the \p dst descriptor.
 void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 3eafadb7842274..f464d1b561d4a6 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -37,9 +37,10 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
   if (stat == StatOk) {
-    Descriptor *deviceAddr{
-        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
-    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(
+        (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;
@@ -54,9 +55,10 @@ int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
   // Descriptor synchronization is only done when the deallocation is done
   // from the host.
   if (stat == StatOk) {
-    Descriptor *deviceAddr{
-        RTNAME(CUFGetDeviceDescAddress)(desc, sourceFile, sourceLine)};
-    RTDECL(CUFDescriptorSync)(deviceAddr, &desc, sourceFile, sourceLine);
+    void *deviceAddr{
+        RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    RTDECL(CUFDescriptorSync)(
+        (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 3eec0135b3d883..7ce1429cd94d4a 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -27,15 +27,15 @@ void RTDEF(CUFFreeDesciptor)(
   CUFFreeManaged(reinterpret_cast<void *>(desc));
 }
 
-Descriptor *RTDEF(CUFGetDeviceDescAddress)(
-    Descriptor &desc, const char *sourceFile, int sourceLine) {
+void *RTDEF(CUFGetDeviceAddress)(
+    void *hostPtr, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   void *p;
-  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, &desc));
+  CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, hostPtr));
   if (!p) {
     terminator.Crash("Could not retrieve symbol's address");
   }
-  return (Descriptor *)p;
+  return p;
 }
 
 void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,

>From a56f0e91c15dea4bfc37799ee68b0a975ef1250e Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 18 Sep 2024 17:31:38 -0700
Subject: [PATCH 3/5] Fix call with RTDECL -> RTNAME

---
 flang/runtime/CUDA/allocatable.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index f464d1b561d4a6..3236a99bea3343 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -39,7 +39,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   if (stat == StatOk) {
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
-    RTDECL(CUFDescriptorSync)(
+    RTNAME(CUFDescriptorSync)(
         (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
@@ -57,7 +57,7 @@ int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
   if (stat == StatOk) {
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
-    RTDECL(CUFDescriptorSync)(
+    RTNAME(CUFDescriptorSync)(
         (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif

>From 852ee9091fd0b10d7c7237aed024d587bbc2ba8a Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 18 Sep 2024 17:36:17 -0700
Subject: [PATCH 4/5] clang-format

---
 flang/runtime/CUDA/allocatable.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 3236a99bea3343..fde3454dc3c890 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -40,7 +40,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
     RTNAME(CUFDescriptorSync)(
-        (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+    (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;
@@ -58,7 +58,7 @@ int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
     RTNAME(CUFDescriptorSync)(
-        (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+    (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;

>From b390c4e1ce093676ab14edeee8f28b2c556167ef Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 18 Sep 2024 17:40:37 -0700
Subject: [PATCH 5/5] More clang-format

---
 flang/runtime/CUDA/allocatable.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index fde3454dc3c890..cc0c647c6c9529 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -39,8 +39,8 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   if (stat == StatOk) {
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
-    RTNAME(CUFDescriptorSync)(
-    (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;
@@ -57,8 +57,8 @@ int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
   if (stat == StatOk) {
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
-    RTNAME(CUFDescriptorSync)(
-    (Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
+    RTNAME(CUFDescriptorSync)
+    ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
 #endif
   return stat;