[flang-commits] [flang] 4e6745c - [flang][cuda] Lower simple host to device data transfer (#85960)
via flang-commits
flang-commits at lists.llvm.org
Mon Mar 25 11:53:43 PDT 2024
Author: Valentin Clement (バレンタイン クレメン)
Date: 2024-03-25T11:53:39-07:00
New Revision: 4e6745cc4db309c0e1b5e41d4598f67763f4c096
URL: https://github.com/llvm/llvm-project/commit/4e6745cc4db309c0e1b5e41d4598f67763f4c096
DIFF: https://github.com/llvm/llvm-project/commit/4e6745cc4db309c0e1b5e41d4598f67763f4c096.diff
LOG: [flang][cuda] Lower simple host to device data transfer (#85960)
In CUDA Fortran, data transfers can be done via assignment statements
between host and device variables.
This patch introduces a `fir.cuda_data_transfer` operation that
materializes the data transfer between two memory references.
Simple host-to-device transfers that do not involve descriptors are also
lowered in this patch. When the rhs is an expression that requires
evaluation, a temporary is created: the evaluation is done on the host
and then the transfer is initiated.
Implicit transfers when device symbols are present on the rhs are not part
of this patch. Transfers from device to host are not part of this patch.
Added:
flang/test/Lower/CUDA/cuda-data-transfer.cuf
Modified:
flang/include/flang/Optimizer/Dialect/FIRAttr.td
flang/include/flang/Optimizer/Dialect/FIROps.td
flang/lib/Lower/Bridge.cpp
flang/lib/Optimizer/Dialect/FIRAttr.cpp
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 2ac4af9e66aa80..f8b3fb861cc62f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -137,4 +137,20 @@ def fir_CUDAClusterDimsAttr : fir_Attr<"CUDAClusterDims"> {
let assemblyFormat = "`<` struct(params) `>`";
}
+// Kind of CUDA Fortran data transfer carried by a data transfer operation.
+// `HostDevice` is the kind used when a host value is assigned to a device
+// variable (`adev = a`). The enum itself does not generate a specialized
+// attribute class (genSpecializedAttr = 0) and lives in the `::fir` namespace.
+def fir_CUDADataTransferKind : I32EnumAttr<
+ "CUDADataTransferKind", "CUDA Fortran data transfer kind",
+ [
+ I32EnumAttrCase<"DeviceHost", 0, "device_host">,
+ I32EnumAttrCase<"HostDevice", 1, "host_device">,
+ I32EnumAttrCase<"DeviceDevice", 2, "device_device">,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::fir";
+}
+
+// FIR dialect attribute wrapping fir_CUDADataTransferKind. With the
+// "cuda_transfer" mnemonic and the format below it is written as
+// `#fir.cuda_transfer<host_device>` in the IR.
+def fir_CUDADataTransferKindAttr :
+ EnumAttr<FIROpsDialect, fir_CUDADataTransferKind, "cuda_transfer"> {
+ let assemblyFormat = [{ ```<` $value `>` }];
+}
+
#endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index b991ec76fdd956..dff1cdb20cbfef 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3165,4 +3165,29 @@ def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
let hasVerifier = 1;
}
+def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> {
+  let summary = "Represent a data transfer between host and device memory";
+
+  let description = [{
+    CUDA Fortran allows data transfer to be done via intrinsic assignment
+    between a host and a device variable. This operation is used to materialize
+    the data transfer from the `src` memory reference (the rhs of the
+    assignment) to the `dst` memory reference (the lhs of the assignment).
+    The kind of transfer is specified in the `transfer_kind` attribute.
+
+    ```
+      adev = a      ! transfer host to device
+      a = adev      ! transfer device to host
+      bdev = adev   ! transfer device to device
+    ```
+  }];
+
+  // The transfer reads the memory referenced by $src and writes the memory
+  // referenced by $dst, so the memory effects are declared accordingly.
+  let arguments = (ins Arg<AnyReferenceLike, "", [MemRead]>:$src,
+                       Arg<AnyReferenceLike, "", [MemWrite]>:$dst,
+                       fir_CUDADataTransferKindAttr:$transfer_kind);
+
+  let assemblyFormat = [{
+    $src `to` $dst attr-dict `:` type(operands)
+  }];
+}
+
#endif
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 0b54ee818e3cd9..48830dc55578c2 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3706,15 +3706,39 @@ class FirConverter : public Fortran::lower::AbstractConverter {
return false;
}
+  /// Lower an assignment involving CUDA Fortran device data to a
+  /// fir.cuda_data_transfer operation.
+  ///
+  /// Only the host-to-device case (\p lhsIsDevice set, \p rhsIsDevice unset)
+  /// with operands that are not boxes is handled here; any other combination
+  /// ends in a TODO diagnostic.
+  ///
+  /// \param builder      builder used to create the operations.
+  /// \param loc          location attached to the created operations.
+  /// \param lhsIsDevice  true when the assignment lhs has CUDA attributes.
+  /// \param lhs          destination of the transfer.
+  /// \param rhsIsDevice  true when the assignment rhs has CUDA attributes.
+  /// \param rhs          source of the transfer (variable or expression value).
+  static void genCUDADataTransfer(fir::FirOpBuilder &builder,
+                                  mlir::Location loc, bool lhsIsDevice,
+                                  hlfir::Entity &lhs, bool rhsIsDevice,
+                                  hlfir::Entity &rhs) {
+    if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
+      TODO(loc, "CUDA data transfer with descriptors");
+    if (lhsIsDevice && !rhsIsDevice) {
+      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
+          builder.getContext(), fir::CUDADataTransferKind::HostDevice);
+      // device = host
+      if (!rhs.isVariable()) {
+        // The rhs is a value with no storage: associate it with a temporary
+        // so the transfer has a memory reference to read from, and release
+        // the temporary once the transfer has been emitted.
+        auto associate = hlfir::genAssociateExpr(
+            loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
+        builder.create<fir::CUDADataTransferOp>(loc, associate.getBase(), lhs,
+                                                transferKindAttr);
+        builder.create<hlfir::EndAssociateOp>(loc, associate);
+      } else {
+        // The rhs already designates memory: transfer directly from it.
+        builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
+                                                transferKindAttr);
+      }
+      return;
+    }
+    TODO(loc, "Assignment with CUDA Fortran variables");
+  }
+
void genDataAssignment(
const Fortran::evaluate::Assignment &assign,
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- if (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
- Fortran::evaluate::HasCUDAAttrs(assign.rhs))
- TODO(loc, "Assignement with CUDA Fortran variables");
+ bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
+ bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
// Gather some information about the assignment that will impact how it is
// lowered.
@@ -3772,9 +3796,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
Fortran::lower::StatementContext localStmtCtx;
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
- builder.create<hlfir::AssignOp>(loc, rhs, lhs,
- isWholeAllocatableAssignment,
- keepLhsLengthInAllocatableAssignment);
+ if (lhsIsDevice || rhsIsDevice) {
+ genCUDADataTransfer(builder, loc, lhsIsDevice, lhs, rhsIsDevice, rhs);
+ } else {
+ builder.create<hlfir::AssignOp>(loc, rhs, lhs,
+ isWholeAllocatableAssignment,
+ keepLhsLengthInAllocatableAssignment);
+ }
return;
}
// Assignments inside Forall, Where, or assignments to a vector subscripted
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index 0cf8dfb9f784c3..e43710f5627ee0 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -299,5 +299,6 @@ void FIROpsDialect::registerAttributes() {
addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr,
UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr,
- CUDALaunchBoundsAttr, CUDAClusterDimsAttr>();
+ CUDALaunchBoundsAttr, CUDAClusterDimsAttr,
+ CUDADataTransferKindAttr>();
}
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
new file mode 100644
index 00000000000000..54226b8623e6a9
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -0,0 +1,57 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran data transfer using assignment statements.
+
+! Host-to-device assignments covering scalar and array destinations, with
+! variable and expression right-hand sides.
+subroutine sub1()
+ integer, device :: m
+ integer, device :: adev(10)
+ integer :: i, ahost(10), bhost(10)
+
+ ! Scalar expression rhs: transferred through a .cuf_host_tmp temporary.
+ m = 1 + i
+
+ ! Scalar constant rhs: also transferred through a temporary.
+ m = 1
+
+ ! Whole-array variable rhs: transferred directly.
+ adev = ahost
+
+ ! Array expression rhs: elemental result transferred through a temporary.
+ adev = ahost + 1
+
+ ! Array sections on both sides: transferred directly between designators.
+ adev(1:5) = ahost(1:5)
+
+ ! Array expression with two host operands: transferred through a temporary.
+ adev = ahost + bhost
+
+end
+
+! CHECK-LABEL: func.func @_QPsub1()
+
+! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+! CHECK: %[[LOADED_I:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
+! CHECK: %[[ADD:.*]] = arith.addi %[[C1]], %[[LOADED_I]] : i32
+! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ADD]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
+
+! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[C1]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
+
+! CHECK: fir.cuda_data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+
+! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
+! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
+! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
+
+! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: fir.cuda_data_transfer %[[DES_AHOST]] to %[[DES_ADEV]] {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
+
+! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
+! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
+! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
More information about the flang-commits
mailing list