[flang-commits] [flang] [flang][cuda] Add support for allocate with source (PR #117388)
via flang-commits
flang-commits at lists.llvm.org
Fri Nov 22 14:02:23 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Add support for allocate statement with CUDA device variable and a source.
---
Full diff: https://github.com/llvm/llvm-project/pull/117388.diff
8 Files Affected:
- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12)
- (added) flang/include/flang/Runtime/CUDA/memmove-function.h (+23)
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+42-19)
- (modified) flang/runtime/CUDA/CMakeLists.txt (+1)
- (modified) flang/runtime/CUDA/allocatable.cpp (+28)
- (added) flang/runtime/CUDA/memmove-function.cpp (+35)
- (modified) flang/runtime/CUDA/memory.cpp (+1-20)
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+45)
``````````diff
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index e986ad910a3f3a..bbfcd2cafcdb21 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -22,6 +22,18 @@ int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
+/// Perform allocation of the descriptor without synchronization. Assign data
+/// from source.
+int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
+ const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine);
+
+/// Perform allocation of the descriptor with synchronization of it when
+/// necessary. Assign data from source.
+int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
+ const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine);
+
/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
diff --git a/flang/include/flang/Runtime/CUDA/memmove-function.h b/flang/include/flang/Runtime/CUDA/memmove-function.h
new file mode 100644
index 00000000000000..74d6a05eff4c9a
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/memmove-function.h
@@ -0,0 +1,23 @@
+//===-- include/flang/Runtime/CUDA/memmove-function.h -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+
+#ifndef FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
+#define FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
+
+namespace Fortran::runtime::cuda {
+
+void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count);
+
+void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count);
+
+void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count);
+
+} // namespace Fortran::runtime::cuda
+#endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index f1ebd08967b9a1..3983336516db9e 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -155,8 +155,12 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
auto fTy = func.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
- mlir::Value sourceLine =
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
+ mlir::Value sourceLine;
+ if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
+ sourceLine = fir::factory::locationToLineNo(
+ builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+ else
+ sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true)
: builder.createBool(loc, false);
@@ -168,8 +172,21 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
errmsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
}
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, op.getBox(), hasStat, errmsg, sourceFile, sourceLine)};
+ llvm::SmallVector<mlir::Value> args;
+ if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+ if (op.getSource())
+ args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+ op.getSource(), hasStat, errmsg,
+ sourceFile, sourceLine);
+ else
+ args =
+ fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
+ errmsg, sourceFile, sourceLine);
+ } else {
+ args =
+ fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
+ errmsg, sourceFile, sourceLine);
+ }
auto callOp = builder.create<fir::CallOp>(loc, func, args);
rewriter.replaceOp(op, callOp);
return mlir::success();
@@ -182,10 +199,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Allocation with source will need a new entry point in the runtime.
- if (op.getSource())
- return mlir::failure();
-
// TODO: Allocation using different stream.
if (op.getStream())
return mlir::failure();
@@ -202,18 +215,28 @@ struct CUFAllocateOpConversion
if (hasDoubleDescriptors(op)) {
// Allocation for module variable are done with custom runtime entry point
// so the descriptors can be synchronized.
- mlir::func::FuncOp func =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
- loc, builder);
- return convertOpToCall(op, rewriter, func);
+ mlir::func::FuncOp func;
+ if (op.getSource())
+ func = fir::runtime::getRuntimeFunc<mkRTKey(
+ CUFAllocatableAllocateSourceSync)>(loc, builder);
+ else
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
+ loc, builder);
+ return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
- // Allocation for local descriptor falls back on the standard runtime
- // AllocatableAllocate as the dedicated allocator is set in the descriptor
- // before the call.
- mlir::func::FuncOp func =
- fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
- builder);
+ mlir::func::FuncOp func;
+ if (op.getSource())
+ func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
+ loc, builder);
+ else
+ // Allocation for local descriptor falls back on the standard runtime
+ // AllocatableAllocate as the dedicated allocator is set in the descriptor
+ // before the call.
+ func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+ loc, builder);
+
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
};
@@ -236,7 +259,7 @@ struct CUFDeallocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableDeallocate)>(
loc, builder);
- return convertOpToCall(op, rewriter, func);
+ return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
}
// Deallocation for local descriptor falls back on the standard runtime
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index ce87f3efdc3632..3a88824826de31 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -18,6 +18,7 @@ add_flang_library(${CUFRT_LIBNAME}
allocatable.cpp
descriptor.cpp
kernel.cpp
+ memmove-function.cpp
memory.cpp
registration.cpp
)
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 649ddb638abe6d..9fed50c859a9cf 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -7,10 +7,12 @@
//===----------------------------------------------------------------------===//
#include "flang/Runtime/CUDA/allocatable.h"
+#include "../assign-impl.h"
#include "../stat.h"
#include "../terminator.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
#include "flang/Runtime/allocatable.h"
#include "cuda_runtime.h"
@@ -45,6 +47,32 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
return stat;
}
+int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
+ const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
+ int stat{RTNAME(AllocatableAllocate)(
+ alloc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (stat == StatOk) {
+ Terminator terminator{sourceFile, sourceLine};
+ Fortran::runtime::DoFromSourceAssign(
+ alloc, source, terminator, &MemmoveHostToDevice);
+ }
+ return stat;
+}
+
+int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
+ const Descriptor &source, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
+ int stat{RTNAME(AllocatableAllocate)(
+ alloc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (stat == StatOk) {
+ Terminator terminator{sourceFile, sourceLine};
+ Fortran::runtime::DoFromSourceAssign(
+ alloc, source, terminator, &MemmoveHostToDevice);
+ }
+ return stat;
+}
+
int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
// Perform the standard allocation.
diff --git a/flang/runtime/CUDA/memmove-function.cpp b/flang/runtime/CUDA/memmove-function.cpp
new file mode 100644
index 00000000000000..3ba9fa7e0f7f73
--- /dev/null
+++ b/flang/runtime/CUDA/memmove-function.cpp
@@ -0,0 +1,35 @@
+//===-- runtime/CUDA/memmove-function.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/memmove-function.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
+
+#include "cuda_runtime.h"
+
+namespace Fortran::runtime::cuda {
+
+void *MemmoveHostToDevice(void *dst, const void *src, std::size_t count) {
+ // TODO: Use cudaMemcpyAsync when we have support for stream.
+ CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+ return dst;
+}
+
+void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count) {
+ // TODO: Use cudaMemcpyAsync when we have support for stream.
+ CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
+ return dst;
+}
+
+void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) {
+ // TODO: Use cudaMemcpyAsync when we have support for stream.
+ CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
+ return dst;
+}
+
+} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index 68963c4d7738ac..0bbb493d2db919 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -11,31 +11,12 @@
#include "../terminator.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
#include "flang/Runtime/assign.h"
#include "cuda_runtime.h"
namespace Fortran::runtime::cuda {
-static void *MemmoveHostToDevice(
- void *dst, const void *src, std::size_t count) {
- // TODO: Use cudaMemcpyAsync when we have support for stream.
- CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
- return dst;
-}
-
-static void *MemmoveDeviceToHost(
- void *dst, const void *src, std::size_t count) {
- // TODO: Use cudaMemcpyAsync when we have support for stream.
- CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
- return dst;
-}
-
-static void *MemmoveDeviceToDevice(
- void *dst, const void *src, std::size_t count) {
- // TODO: Use cudaMemcpyAsync when we have support for stream.
- CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
- return dst;
-}
extern "C" {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index d68ff894d5af5a..47d75b16b7a2d2 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -120,4 +120,49 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
+func.func @_QPallocate_source() {
+ %c0_i64 = arith.constant 0 : i64
+ %c1_i32 = arith.constant 1 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"}
+ %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %5 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a_d", data_attr = #cuf.cuda<device>, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %7 = fir.declare %5 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %8 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %22 = cuf.allocate %7 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%8 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+ return
+}
+
+// CHECK-LABEL: func.func @_QPallocate_source()
+// CHECK: %[[DECL_HOST:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[DECL_DEV:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
+
+fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
+ %1 = fir.shape %c0, %c0 : (index, index) -> !fir.shape<2>
+ %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+ fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
+}
+func.func @_QMmod1Pallocate_source_global() {
+ %0 = fir.address_of(@_QMmod1Ea_d) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
+ %6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+ return
+}
+
+// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
+
} // end of module
``````````
</details>
https://github.com/llvm/llvm-project/pull/117388
More information about the flang-commits
mailing list