[flang-commits] [flang] [flang][cuda] cuf.allocate: Carry over stream to the runtime call (PR #117631)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Mon Nov 25 13:02:55 PST 2024
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/117631
- Update the runtime entry points to accept stream information.
- Update the conversion of `cuf.allocate` to correctly pass the stream information when present.
Note that the stream is not currently used in the runtime. This will be done in a separate patch, as the design/solution needs to be done together with the allocators.
>From a7409dba31c5217841dcebb3982361e6e3fe58b2 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 25 Nov 2024 11:55:34 -0800
Subject: [PATCH] [flang][cuda] cuf.allocate: Carry over stream to the runtime
call
---
.../include/flang/Runtime/CUDA/allocatable.h | 15 ++++---
.../Optimizer/Transforms/CUFOpConversion.cpp | 39 ++++++++++---------
flang/runtime/CUDA/allocatable.cpp | 38 +++++++++++-------
flang/test/Fir/CUDA/cuda-allocate.fir | 26 ++++++++++---
4 files changed, 75 insertions(+), 43 deletions(-)
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
extern "C" {
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
+
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+ builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
- if (op.getSource())
+ if (op.getSource()) {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- op.getSource(), hasStat, errmsg,
- sourceFile, sourceLine);
- else
- args =
- fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
- errmsg, sourceFile, sourceLine);
+ op.getSource(), stream, hasStat,
+ errmsg, sourceFile, sourceLine);
+ } else {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+ args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+ stream, hasStat, errmsg, sourceFile,
+ sourceLine);
+ }
} else {
args =
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Allocation using different stream.
- if (op.getStream())
- return mlir::failure();
-
// TODO: Pinned is a reference to a logical value that can be set to true
// when pinned allocation succeed. This will require a new entry point.
if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFAllocatableAllocateSourceSync)>(loc, builder);
else
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
- loc, builder);
+ func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+ loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
loc, builder);
else
- // Allocation for local descriptor falls back on the standard runtime
- // AllocatableAllocate as the dedicated allocator is set in the descriptor
- // before the call.
- func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- if (desc.HasAddendum()) {
- Terminator terminator{sourceFile, sourceLine};
- // TODO: This require a bit more work to set the correct type descriptor
- // address
- terminator.Crash(
- "not yet implemented: CUDA descriptor allocation with addendum");
- }
- // Perform the standard allocation.
- int stat{RTNAME(AllocatableAllocate)(
- desc, hasStat, errMsg, sourceFile, sourceLine)};
+ int stat{RTNAME(CUFAllocatableAllocate)(
+ desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
return stat;
}
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
+ if (desc.HasAddendum()) {
+ Terminator terminator{sourceFile, sourceLine};
+ // TODO: This require a bit more work to set the correct type descriptor
+ // address
+ terminator.Crash(
+ "not yet implemented: CUDA descriptor allocation with addendum");
+ }
+ // Perform the standard allocation.
+ int stat{RTNAME(AllocatableAllocate)(
+ desc, hasStat, errMsg, sourceFile, sourceLine)};
+ return stat;
+}
+
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
+func.func @_QQallocate_stream() {
+ %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+ %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+ %4 = fir.load %3 : !fir.ref<i64>
+ %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+ return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream()
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
} // end of module
More information about the flang-commits
mailing list