[flang-commits] [flang] [flang][cuda] Add support for allocate with source (PR #117388)

Fri Nov 22 14:02:23 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

Add support for allocate statement with CUDA device variable and a source.


---
Full diff: https://github.com/llvm/llvm-project/pull/117388.diff


8 Files Affected:

- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12) 
- (added) flang/include/flang/Runtime/CUDA/memmove-function.h (+23) 
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+42-19) 
- (modified) flang/runtime/CUDA/CMakeLists.txt (+1) 
- (modified) flang/runtime/CUDA/allocatable.cpp (+28) 
- (added) flang/runtime/CUDA/memmove-function.cpp (+35) 
- (modified) flang/runtime/CUDA/memory.cpp (+1-20) 
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+45) 


``````````diff

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index e986ad910a3f3a..bbfcd2cafcdb21 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -22,6 +22,18 @@ int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
+/// Perform allocation of the descriptor without synchronization. Assign data
+/// from source.
+int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
+    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine);
+
+/// Perform allocation of the descriptor with synchronization of it when
+/// necessary. Assign data from source.
+int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
+    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine);
+
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
diff --git a/flang/include/flang/Runtime/CUDA/memmove-function.h b/flang/include/flang/Runtime/CUDA/memmove-function.h
new file mode 100644
index 00000000000000..74d6a05eff4c9a
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/memmove-function.h
@@ -0,0 +1,23 @@
+//===-- include/flang/Runtime/CUDA/memmove-function.h -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+
+#ifndef FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
+#define FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
+
+namespace Fortran::runtime::cuda {
+
+void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count);
+
+void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count);
+
+void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count);
+
+} // namespace Fortran::runtime::cuda
+#endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index f1ebd08967b9a1..3983336516db9e 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -155,8 +155,12 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   auto fTy = func.getFunctionType();
 
   mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
-  mlir::Value sourceLine =
-      fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
+  mlir::Value sourceLine;
+  if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
+    sourceLine = fir::factory::locationToLineNo(
+        builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+  else
+    sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
   mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true)
                                         : builder.createBool(loc, false);
@@ -168,8 +172,21 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
     mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
     errmsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
   }
-  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-      builder, loc, fTy, op.getBox(), hasStat, errmsg, sourceFile, sourceLine)};
+  llvm::SmallVector<mlir::Value> args;
+  if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    if (op.getSource())
+      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+                                           op.getSource(), hasStat, errmsg,
+                                           sourceFile, sourceLine);
+    else
+      args =
+          fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
+                                        errmsg, sourceFile, sourceLine);
+  } else {
+    args =
+        fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
+                                      errmsg, sourceFile, sourceLine);
+  }
   auto callOp = builder.create<fir::CallOp>(loc, func, args);
   rewriter.replaceOp(op, callOp);
   return mlir::success();
@@ -182,10 +199,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Allocation with source will need a new entry point in the runtime.
-    if (op.getSource())
-      return mlir::failure();
-
     // TODO: Allocation using different stream.
     if (op.getStream())
       return mlir::failure();
@@ -202,18 +215,28 @@ struct CUFAllocateOpConversion
     if (hasDoubleDescriptors(op)) {
       // Allocation for module variable are done with custom runtime entry point
       // so the descriptors can be synchronized.
-      mlir::func::FuncOp func =
-          fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
-              loc, builder);
-      return convertOpToCall(op, rewriter, func);
+      mlir::func::FuncOp func;
+      if (op.getSource())
+        func = fir::runtime::getRuntimeFunc<mkRTKey(
+            CUFAllocatableAllocateSourceSync)>(loc, builder);
+      else
+        func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
+            loc, builder);
+      return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
     }
 
-    // Allocation for local descriptor falls back on the standard runtime
-    // AllocatableAllocate as the dedicated allocator is set in the descriptor
-    // before the call.
-    mlir::func::FuncOp func =
-        fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
-                                                                   builder);
+    mlir::func::FuncOp func;
+    if (op.getSource())
+      func =
+          fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
+              loc, builder);
+    else
+      // Allocation for local descriptor falls back on the standard runtime
+      // AllocatableAllocate as the dedicated allocator is set in the descriptor
+      // before the call.
+      func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+          loc, builder);
+
     return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
   }
 };
@@ -236,7 +259,7 @@ struct CUFDeallocateOpConversion
       mlir::func::FuncOp func =
           fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableDeallocate)>(
               loc, builder);
-      return convertOpToCall(op, rewriter, func);
+      return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
     }
 
     // Deallocation for local descriptor falls back on the standard runtime
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index ce87f3efdc3632..3a88824826de31 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -18,6 +18,7 @@ add_flang_library(${CUFRT_LIBNAME}
   allocatable.cpp
   descriptor.cpp
   kernel.cpp
+  memmove-function.cpp
   memory.cpp
   registration.cpp
 )
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 649ddb638abe6d..9fed50c859a9cf 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -7,10 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/allocatable.h"
+#include "../assign-impl.h"
 #include "../stat.h"
 #include "../terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
 #include "flang/Runtime/allocatable.h"
 
 #include "cuda_runtime.h"
@@ -45,6 +47,32 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   return stat;
 }
 
+int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
+    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
+  int stat{RTNAME(AllocatableAllocate)(
+      alloc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (stat == StatOk) {
+    Terminator terminator{sourceFile, sourceLine};
+    Fortran::runtime::DoFromSourceAssign(
+        alloc, source, terminator, &MemmoveHostToDevice);
+  }
+  return stat;
+}
+
+int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
+    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
+  int stat{RTNAME(AllocatableAllocate)(
+      alloc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (stat == StatOk) {
+    Terminator terminator{sourceFile, sourceLine};
+    Fortran::runtime::DoFromSourceAssign(
+        alloc, source, terminator, &MemmoveHostToDevice);
+  }
+  return stat;
+}
+
 int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   // Perform the standard allocation.
diff --git a/flang/runtime/CUDA/memmove-function.cpp b/flang/runtime/CUDA/memmove-function.cpp
new file mode 100644
index 00000000000000..3ba9fa7e0f7f73
--- /dev/null
+++ b/flang/runtime/CUDA/memmove-function.cpp
@@ -0,0 +1,35 @@
+//===-- runtime/CUDA/memmove-function.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/memmove-function.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
+
+namespace Fortran::runtime::cuda {
+
+void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+  return dst;
+}
+
+void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
+  return dst;
+}
+
+void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
+  return dst;
+}
+
+} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index 68963c4d7738ac..0bbb493d2db919 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -11,31 +11,12 @@
 #include "../terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
 #include "flang/Runtime/assign.h"
 
 #include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
-static void *MemmoveHostToDevice(
-    void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
-  return dst;
-}
-
-static void *MemmoveDeviceToHost(
-    void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
-  return dst;
-}
-
-static void *MemmoveDeviceToDevice(
-    void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
-  return dst;
-}
 
 extern "C" {
 
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index d68ff894d5af5a..47d75b16b7a2d2 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -120,4 +120,49 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
+
+func.func @_QPallocate_source() {
+  %c0_i64 = arith.constant 0 : i64
+  %c1_i32 = arith.constant 1 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"}
+  %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %5 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a_d", data_attr = #cuf.cuda<device>, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %7 = fir.declare %5 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %8 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %22 = cuf.allocate %7 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%8 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QPallocate_source()
+// CHECK: %[[DECL_HOST:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[DECL_DEV:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
+
+fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+  %c0 = arith.constant 0 : index
+  %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+  %1 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2>
+  %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+  fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+}
+func.func @_QMmod1Pallocate_source_global() {
+  %0 = fir.address_of(@_QMmod1Ea_d) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
+  %6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+  %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
+
 } // end of module

``````````

</details>


https://github.com/llvm/llvm-project/pull/117388