[flang-commits] [flang] [flang][cuda] Materialize constant src in memory (PR #116851)

Valentin Clement バレンタイン クレメン via flang-commits flang-commits at lists.llvm.org
Tue Nov 19 10:01:24 PST 2024


https://github.com/clementval created https://github.com/llvm/llvm-project/pull/116851

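When the source (`src`) of a data transfer is a constant, it must first be materialized in memory: the constant is stored into a temporary, and the temporary's address is then passed as the source of the transfer. For example, the scalar constant assigned to the device array in the loop below: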

```
subroutine sub1()
  real, device :: a(10)
  integer :: I

  do i = 5, 10
    a(i) = -4.0
  end do
end
```

>From 70ddba9c1005659ef714f85ad3921a29aa404f1f Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 19 Nov 2024 09:59:46 -0800
Subject: [PATCH] [flang][cuda] Materialize constant src in memory

---
 .../Optimizer/Transforms/CUFOpConversion.cpp  |  6 +++
 flang/test/Fir/CUDA/cuda-data-transfer.fir    | 40 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 17699dadc7511f..f1ebd08967b9a1 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -628,6 +628,12 @@ struct CUFDataTransferOpConversion
 
       mlir::Value dst = getDeviceAddress(rewriter, op.getDstMutable(), symtab);
       mlir::Value src = getDeviceAddress(rewriter, op.getSrcMutable(), symtab);
+      // Materialize the src if constant.
+      if (matchPattern(src.getDefiningOp(), mlir::m_Constant())) {
+        mlir::Value temp = builder.createTemporary(loc, srcTy);
+        builder.create<fir::StoreOp>(loc, src, temp);
+        src = temp;
+      }
       llvm::SmallVector<mlir::Value> args{
           fir::runtime::createArguments(builder, loc, fTy, dst, src, bytes,
                                         modeValue, sourceFile, sourceLine)};
diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index 5f10dc0562d179..0f9ca6e640a802 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -513,4 +513,44 @@ func.func @_QPcallkernel(%arg0: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bind
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref<!fir.box<!fir.array<?x?xcomplex<f32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFDataTransferDescDesc(%{{.*}}, %[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
 
+func.func @_QPsrc_cst() {
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "d4", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Ed4"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+  %5:2 = hlfir.declare %1 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ed4"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %6 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsub4Ei"}
+  %7:2 = hlfir.declare %6 {uniq_name = "_QFsub4Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %c1 = arith.constant 1 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %9 = fir.convert %5#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+  %c6_i32 = arith.constant 6 : i32
+  %14 = fir.convert %c6_i32 : (i32) -> index
+  %c10_i32_0 = arith.constant 10 : i32
+  %15 = fir.convert %c10_i32_0 : (i32) -> index
+  %c1_1 = arith.constant 1 : index
+  %16 = fir.convert %14 : (index) -> i32
+  %17:2 = fir.do_loop %arg1 = %14 to %15 step %c1_1 iter_args(%arg2 = %16) -> (index, i32) {
+    fir.store %arg2 to %7#1 : !fir.ref<i32>
+    %cst = arith.constant -4.000000e+00 : f32
+    %22 = fir.load %5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %23 = fir.load %7#0 : !fir.ref<i32>
+    %24 = fir.convert %23 : (i32) -> i64
+    %25 = hlfir.designate %22 (%24)  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, i64) -> !fir.ref<f32>
+    cuf.data_transfer %cst to %25 {transfer_kind = #cuf.cuda_transfer<host_device>} : f32, !fir.ref<f32>
+    %26 = arith.addi %arg1, %c1_1 : index
+    %27 = fir.convert %c1_1 : (index) -> i32
+    %28 = fir.load %7#1 : !fir.ref<i32>
+    %29 = arith.addi %28, %27 : i32
+    fir.result %26, %29 : index, i32
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @_QPsrc_cst()
+// CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+// CHECK: %[[CST:.*]] = arith.constant -4.000000e+00 : f32
+// CHECK: fir.store %[[CST]] to %[[ALLOCA]] : !fir.ref<f32>
+// CHECK: %[[CONV:.*]] = fir.convert %[[ALLOCA]] : (!fir.ref<f32>) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%{{.*}}, %[[CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
+
 } // end of module



More information about the flang-commits mailing list