[llvm-branch-commits] [flang] [flang][cuda] Data transfer with descriptor (PR #114302)

Wed Oct 30 14:08:42 PDT 2024

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/114302

>From e4c7e31c77bbfda563e4e2c9b591fe2f5cb2c259 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 30 Oct 2024 11:53:12 -0700
Subject: [PATCH 1/2] [flang][cuda] Data transfer with descriptor

---
 flang/runtime/CUDA/memory.cpp           | 34 +++++++++++++++++++--
 flang/unittests/Runtime/CUDA/Memory.cpp | 40 +++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index 4778a4ae77683f..f25d3b531c84f0 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -9,10 +9,32 @@
 #include "flang/Runtime/CUDA/memory.h"
 #include "../terminator.h"
 #include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/assign.h"
 
 #include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
+static void *MemmoveHostToDevice(
+    void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+  return dst;
+}
+
+static void *MemmoveDeviceToHost(
+    void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
+  return dst;
+}
+
+static void *MemmoveDeviceToDevice(
+    void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+  return dst;
+}
+
 extern "C" {
 
 void *RTDEF(CUFMemAlloc)(
@@ -90,8 +112,16 @@ void RTDEF(CUFDataTransferPtrDesc)(void *addr, Descriptor *desc,
 void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
     unsigned mode, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
-  terminator.Crash(
-      "not yet implemented: CUDA data transfer between two descriptors");
+  MemmoveFct memmoveFct;
+  if (mode == kHostToDevice) {
+    memmoveFct = &MemmoveHostToDevice;
+  } else if (mode == kDeviceToHost) {
+    memmoveFct = &MemmoveDeviceToHost;
+  } else if (mode == kDeviceToDevice) {
+    memmoveFct = &MemmoveDeviceToDevice;
+  }
+  Fortran::runtime::Assign(
+      dstDesc, srcDesc, terminator, MaybeReallocate, memmoveFct);
 }
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/unittests/Runtime/CUDA/Memory.cpp b/flang/unittests/Runtime/CUDA/Memory.cpp
index 157d3cdb531def..ade05e21b70a89 100644
--- a/flang/unittests/Runtime/CUDA/Memory.cpp
+++ b/flang/unittests/Runtime/CUDA/Memory.cpp
@@ -9,11 +9,17 @@
 #include "flang/Runtime/CUDA/memory.h"
 #include "gtest/gtest.h"
 #include "../../../runtime/terminator.h"
+#include "../tools.h"
 #include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/common.h"
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/allocatable.h"
+#include "flang/Runtime/allocator-registry.h"
 
 #include "cuda_runtime.h"
 
+using namespace Fortran::runtime;
 using namespace Fortran::runtime::cuda;
 
 TEST(MemoryCUFTest, SimpleAllocTramsferFree) {
@@ -29,3 +35,37 @@ TEST(MemoryCUFTest, SimpleAllocTramsferFree) {
   EXPECT_EQ(42, host);
   RTNAME(CUFMemFree)((void *)dev, kMemTypeDevice, __FILE__, __LINE__);
 }
+
+static OwningPtr<Descriptor> createAllocatable(
+    Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+  return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+      CFI_attribute_allocatable);
+}
+
+TEST(MemoryCUFTest, CUFDataTransferDescDesc) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // INTEGER(4), DEVICE, ALLOCATABLE :: a(:)
+  auto dev{createAllocatable(TypeCategory::Integer, 4)};
+  dev->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx());
+  RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*dev, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(dev->IsAllocated());
+
+  // Create temp array to transfer to device.
+  auto x{MakeArray<TypeCategory::Integer, 4>(std::vector<int>{10},
+      std::vector<int32_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})};
+  RTNAME(CUFDataTransferDescDesc)(*dev, *x, kHostToDevice, __FILE__, __LINE__);
+
+  // Retrieve data from device.
+  auto host{MakeArray<TypeCategory::Integer, 4>(std::vector<int>{10},
+      std::vector<int32_t>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})};
+  RTNAME(CUFDataTransferDescDesc)(
+      *host, *dev, kDeviceToHost, __FILE__, __LINE__);
+
+  for (unsigned i = 0; i < 10; ++i) {
+    EXPECT_EQ(*host->ZeroBasedIndexedElement<std::int32_t>(i), (std::int32_t)i);
+  }
+}

>From be6734745eba64a1e05886b30eba64409658b5ae Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 30 Oct 2024 14:08:23 -0700
Subject: [PATCH 2/2] fix call

---
 flang/runtime/CUDA/allocatable.cpp | 1 +
 flang/runtime/CUDA/memory.cpp      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 649ddb638abe6d..3f1e54262d583c 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -38,6 +38,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   if (stat == StatOk) {
     void *deviceAddr{
         RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
+    printf("Addresses: host %p device %p", &desc, deviceAddr);
     RTNAME(CUFDescriptorSync)
     ((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
   }
diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index f25d3b531c84f0..b49500d971ac90 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -121,7 +121,7 @@ void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
     memmoveFct = &MemmoveDeviceToDevice;
   }
   Fortran::runtime::Assign(
-      dstDesc, srcDesc, terminator, MaybeReallocate, memmoveFct);
+      *dstDesc, *srcDesc, terminator, MaybeReallocate, memmoveFct);
 }
 }
 } // namespace Fortran::runtime::cuda