[llvm-branch-commits] [flang] [flang][cuda] Add CUF allocator (PR #101216)

Tue Jul 30 13:48:54 PDT 2024

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/101216

>From 825e6efbbe20041b2b1591617f32abc12a0b42ff Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 12 Jul 2024 15:20:12 -0700
Subject: [PATCH 1/2] [flang][cuda] Add CUF allocator

---
 flang/CMakeLists.txt                          |  7 ++
 flang/include/flang/Runtime/CUDA/allocator.h  | 43 +++++++++
 flang/runtime/CMakeLists.txt                  |  3 +
 flang/runtime/CUDA/CMakeLists.txt             | 18 ++++
 flang/runtime/CUDA/allocator.cpp              | 62 +++++++++++++
 flang/unittests/Runtime/CMakeLists.txt        |  2 +
 flang/unittests/Runtime/CUDA/AllocatorCUF.cpp | 87 +++++++++++++++++++
 flang/unittests/Runtime/CUDA/CMakeLists.txt   | 15 ++++
 8 files changed, 237 insertions(+)
 create mode 100644 flang/include/flang/Runtime/CUDA/allocator.h
 create mode 100644 flang/runtime/CUDA/CMakeLists.txt
 create mode 100644 flang/runtime/CUDA/allocator.cpp
 create mode 100644 flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
 create mode 100644 flang/unittests/Runtime/CUDA/CMakeLists.txt

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 070c39eb6e9ab..971e5d5c93f23 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -461,6 +461,13 @@ option(FLANG_BUILD_TOOLS
 if (FLANG_BUILD_TOOLS)
   add_subdirectory(tools)
 endif()
+
+option(FLANG_CUF_RUNTIME
+  "Compile CUDA Fortran runtime sources" OFF)
+if (FLANG_CUF_RUNTIME)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
 add_subdirectory(runtime)
 
 if (LLVM_INCLUDE_EXAMPLES)
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
new file mode 100644
index 0000000000000..0738d1e3a8bf3
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -0,0 +1,43 @@
+//===-- include/flang/Runtime/CUDA/allocator.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
+#define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
+
+#include "flang/Runtime/descriptor.h"
+
+static constexpr unsigned kPinnedAllocatorPos = 1;
+static constexpr unsigned kDeviceAllocatorPos = 2;
+static constexpr unsigned kManagedAllocatorPos = 3;
+
+#define CUDA_REPORT_IF_ERROR(expr) \
+  [](CUresult result) { \
+    if (!result) \
+      return; \
+    const char *name = nullptr; \
+    cuGetErrorName(result, &name); \
+    if (!name) \
+      name = "<unknown>"; \
+    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
+  }(expr)
+
+namespace Fortran::runtime::cuf {
+
+void CUFRegisterAllocator();
+
+void *CUFAllocPinned(std::size_t);
+void CUFFreePinned(void *);
+
+void *CUFAllocDevice(std::size_t);
+void CUFFreeDevice(void *);
+
+void *CUFAllocManaged(std::size_t);
+void CUFFreeManaged(void *);
+
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 1f3ae23dcbf12..4537b2d059d65 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -309,3 +309,6 @@ if (TARGET flang-new AND TARGET module_files)
   add_dependencies(FortranRuntime flang-new module_files)
 endif()
 
+if (FLANG_CUF_RUNTIME)
+  add_subdirectory(CUDA)
+endif()
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
new file mode 100644
index 0000000000000..e963b6062abc4
--- /dev/null
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -0,0 +1,18 @@
+#===-- runtime/CUDA/CMakeLists.txt -----------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+include_directories(${CUDAToolkit_INCLUDE_DIRS})
+find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED)
+
+add_flang_library(CufRuntime
+  allocator.cpp
+)
+target_link_libraries(CufRuntime
+PRIVATE
+${CUDA_RUNTIME_LIBRARY}
+)
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
new file mode 100644
index 0000000000000..3c913e344335b
--- /dev/null
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -0,0 +1,62 @@
+//===-- runtime/CUDA/allocator.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/allocator.h"
+#include "../allocator-registry.h"
+#include "../derived.h"
+#include "../stat.h"
+#include "../terminator.h"
+#include "../type-info.h"
+#include "flang/Common/Fortran.h"
+#include "flang/ISO_Fortran_binding_wrapper.h"
+
+#include "cuda.h"
+
+namespace Fortran::runtime::cuf {
+
+void CUFRegisterAllocator() {
+  allocatorRegistry.Register(
+      kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
+  allocatorRegistry.Register(
+      kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
+  allocatorRegistry.Register(
+      kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
+}
+
+void *CUFAllocPinned(std::size_t sizeInBytes) {
+  void *p;
+  CUDA_REPORT_IF_ERROR(cuMemAllocHost(&p, sizeInBytes));
+  return p;
+}
+
+void CUFFreePinned(void *p) {
+  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+void *CUFAllocDevice(std::size_t sizeInBytes) {
+  CUdeviceptr p = 0;
+  CUDA_REPORT_IF_ERROR(cuMemAlloc(&p, sizeInBytes));
+  return reinterpret_cast<void *>(p);
+}
+
+void CUFFreeDevice(void *p) {
+  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+void *CUFAllocManaged(std::size_t sizeInBytes) {
+  CUdeviceptr p = 0;
+  CUDA_REPORT_IF_ERROR(
+      cuMemAllocManaged(&p, sizeInBytes, CU_MEM_ATTACH_GLOBAL));
+  return reinterpret_cast<void *>(p);
+}
+
+void CUFFreeManaged(void *p) {
+  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+} // namespace Fortran::runtime::cuf
diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt
index ed047b08ada35..2c3f8c1a9e9ac 100644
--- a/flang/unittests/Runtime/CMakeLists.txt
+++ b/flang/unittests/Runtime/CMakeLists.txt
@@ -35,3 +35,5 @@ target_link_libraries(FlangRuntimeTests
   PRIVATE
   FortranRuntime
 )
+
+add_subdirectory(CUDA)
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
new file mode 100644
index 0000000000000..204826d3f2a96
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -0,0 +1,87 @@
+//===-- flang/unittests/Runtime/AllocatableCUF.cpp ---------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/allocatable.h"
+
+#include "cuda.h"
+
+using namespace Fortran::runtime;
+
+static OwningPtr<Descriptor> createAllocatable(
+    Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+  return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+      CFI_attribute_allocatable);
+}
+
+thread_local static int32_t defaultDevice = 0;
+
+CUdevice getDefaultCuDevice() {
+  CUdevice device;
+  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+  return device;
+}
+
+class ScopedContext {
+public:
+  ScopedContext() {
+    // Static reference to CUDA primary context for device ordinal
+    // defaultDevice.
+    static CUcontext context = [] {
+      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
+      CUcontext ctx;
+      // Note: this does not affect the current context.
+      CUDA_REPORT_IF_ERROR(
+          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
+      return ctx;
+    }();
+
+    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
+  }
+
+  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
+};
+
+TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
+  using Fortran::common::TypeCategory;
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->raw().SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->raw().GetAllocIdx());
+  EXPECT_FALSE(a->raw().HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+}
+
+TEST(AllocatableCUFTest, SimplePinnedAllocate) {
+  using Fortran::common::TypeCategory;
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // INTEGER(4), PINNED, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Integer, 4)};
+  EXPECT_FALSE(a->raw().HasAddendum());
+  a->raw().SetAllocIdx(kPinnedAllocatorPos);
+  EXPECT_EQ((int)kPinnedAllocatorPos, a->raw().GetAllocIdx());
+  EXPECT_FALSE(a->raw().HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+}
diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
new file mode 100644
index 0000000000000..14b5c788719b8
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -0,0 +1,15 @@
+if (FLANG_CUF_RUNTIME)
+
+add_flang_unittest(FlangCufRuntimeTests
+  AllocatorCUF.cpp
+)
+
+target_link_libraries(FlangCufRuntimeTests
+  PRIVATE
+  CufRuntime
+  FortranRuntime
+)
+
+target_include_directories(FlangCufRuntimeTests PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+endif()

>From 86ce320d39a6ea0fd25fad592453bdc2033e6103 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 30 Jul 2024 13:48:42 -0700
Subject: [PATCH 2/2] Use Terminator and switch pinned deallocator to
 cuFreeMemHost

---
 flang/include/flang/Runtime/CUDA/allocator.h  | 3 ++-
 flang/runtime/CUDA/allocator.cpp              | 4 ++--
 flang/unittests/Runtime/CUDA/AllocatorCUF.cpp | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 0738d1e3a8bf3..9f6fb55bea744 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -23,7 +23,8 @@ static constexpr unsigned kManagedAllocatorPos = 3;
     cuGetErrorName(result, &name); \
     if (!name) \
       name = "<unknown>"; \
-    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
+    Terminator terminator{__FILE__, __LINE__}; \
+    terminator.Crash("'%s' failed with '%s'", #expr, name); \
   }(expr)
 
 namespace Fortran::runtime::cuf {
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index 3c913e344335b..899532a1a5e1c 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "../terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "../allocator-registry.h"
 #include "../derived.h"
 #include "../stat.h"
-#include "../terminator.h"
 #include "../type-info.h"
 #include "flang/Common/Fortran.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
@@ -35,7 +35,7 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
 }
 
 void CUFFreePinned(void *p) {
-  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+  CUDA_REPORT_IF_ERROR(cuMemFreeHost(p));
 }
 
 void *CUFAllocDevice(std::size_t sizeInBytes) {
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 204826d3f2a96..caa62be6aa921 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,6 +10,7 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/allocatable.h"
+#include "../../../runtime/terminator.h"
 
 #include "cuda.h"