[flang-commits] [flang] [flang][cuda] cuf.allocate: Carry over stream to the runtime call (PR #117631)

Mon Nov 25 13:03:32 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

- Update the runtime entry points to accept a stream information
- Update the conversion of `cuf.allocate` to pass correctly the stream information when present. 

Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators. 

---
Full diff: https://github.com/llvm/llvm-project/pull/117631.diff


4 Files Affected:

- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+10-5) 
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+21-18) 
- (modified) flang/runtime/CUDA/allocatable.cpp (+23-15) 
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+21-5) 


``````````diff

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+        builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
-    if (op.getSource())
+    if (op.getSource()) {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           op.getSource(), hasStat, errmsg,
-                                           sourceFile, sourceLine);
-    else
-      args =
-          fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
-                                        errmsg, sourceFile, sourceLine);
+                                           op.getSource(), stream, hasStat,
+                                           errmsg, sourceFile, sourceLine);
+    } else {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+                                           stream, hasStat, errmsg, sourceFile,
+                                           sourceLine);
+    }
   } else {
     args =
         fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Allocation using different stream.
-    if (op.getStream())
-      return mlir::failure();
-
     // TODO: Pinned is a reference to a logical value that can be set to true
     // when pinned allocation succeed. This will require a new entry point.
     if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
         func = fir::runtime::getRuntimeFunc<mkRTKey(
             CUFAllocatableAllocateSourceSync)>(loc, builder);
       else
-        func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
-            loc, builder);
+        func =
+            fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+                loc, builder);
       return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
     }
 
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
           fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
               loc, builder);
     else
-      // Allocation for local descriptor falls back on the standard runtime
-      // AllocatableAllocate as the dedicated allocator is set in the descriptor
-      // before the call.
-      func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+      func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
           loc, builder);
 
     return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  if (desc.HasAddendum()) {
-    Terminator terminator{sourceFile, sourceLine};
-    // TODO: This require a bit more work to set the correct type descriptor
-    // address
-    terminator.Crash(
-        "not yet implemented: CUDA descriptor allocation with addendum");
-  }
-  // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(
-      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(
+      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   return stat;
 }
 
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  return stat;
+}
+
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
 // CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
 // CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
 
+func.func @_QQallocate_stream() {
+  %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+  %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+  %4 = fir.load %3 : !fir.ref<i64>
+  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream() 
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
 } // end of module

``````````

</details>


https://github.com/llvm/llvm-project/pull/117631