[flang-commits] [flang] [flang][cuda] Introduce stream cast op (PR #136050)

Wed Apr 16 15:46:26 PDT 2025

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/136050

>From ee803f7439288419d0ed03f7acb266c91ae8e15d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 16 Apr 2025 15:36:54 -0700
Subject: [PATCH 1/2] [flang][cuda] Introduce stream cast op

---
 .../flang/Optimizer/Dialect/CUF/CUFOps.td     | 22 +++++++++++++++++++
 .../include/flang/Optimizer/Support/InitFIR.h |  2 +-
 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp    | 11 ++++++++++
 flang/test/Fir/CUDA/cuda-stream.mlir          | 21 ++++++++++++++++++
 flang/tools/fir-opt/fir-opt.cpp               |  2 --
 5 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Fir/CUDA/cuda-stream.mlir

diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index feef5485194f8..595469398a34b 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -18,6 +18,7 @@ include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
 include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
 include "flang/Optimizer/Dialect/FIRAttr.td"
+include "mlir/Dialect/GPU/IR/GPUBase.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/IR/BuiltinAttributes.td"
@@ -370,4 +371,25 @@ def cuf_SharedMemoryOp
       CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
 }
 
+def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
+  let summary = "Adapt a stream value to a GPU async token";
+
+  let description = [{
+    Cast a stream object reference as a GPU async token. This is useful to be
+    able to connect the stream representation of CUDA Fortran and the async
+    mechnism of the GPU dialect.
+    Later in the lowering this will becoming a no op.
+  }];
+
+  let arguments = (ins fir_ReferenceType:$stream); // must be !fir.ref<i64>
+
+  let results = (outs GPU_AsyncToken:$token); // async dep for GPU dialect ops
+
+  let assemblyFormat = [{
+    $stream attr-dict `:` type($stream)
+  }];
+
+  let hasVerifier = 1; // see cuf::StreamCastOp::verify() in CUFOps.cpp
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index 4c57e01c28c93..1868fbb201970 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -40,7 +40,7 @@ namespace fir::support {
       mlir::cf::ControlFlowDialect, mlir::func::FuncDialect,                   \
       mlir::vector::VectorDialect, mlir::math::MathDialect,                    \
       mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect,       \
-      mlir::NVVM::NVVMDialect
+      mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect
 
 #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect
 
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 957e4c01fb4a1..ce197d48d4860 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -319,6 +319,17 @@ void cuf::SharedMemoryOp::build(
   result.addAttributes(attributes);
 }
 
+//===----------------------------------------------------------------------===//
+// StreamCastOp
+//===----------------------------------------------------------------------===//
+/// ODS guarantees a fir.ref operand (hence cast<>); its pointee must be i64.
+llvm::LogicalResult cuf::StreamCastOp::verify() {
+  auto refTy = mlir::cast<fir::ReferenceType>(getStream().getType());
+  if (!refTy.getEleTy().isInteger(64))
+    return emitOpError("stream is expected to be a i64 reference");
+  return mlir::success();
+}
+
 // Tablegen operators
 
 #define GET_OP_CLASSES
diff --git a/flang/test/Fir/CUDA/cuda-stream.mlir b/flang/test/Fir/CUDA/cuda-stream.mlir
new file mode 100644
index 0000000000000..50f230467854b
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-stream.mlir
@@ -0,0 +1,21 @@
+// RUN: fir-opt --split-input-file %s | FileCheck %s
+
+module attributes {gpu.container_module} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmod1Psub1() kernel {
+      gpu.return
+    }
+  }
+  func.func @_QMmod1Phost_sub() {
+    %0 = fir.alloca i64
+    %1 = arith.constant 1 : index
+    %asyncTok = cuf.stream_cast %0 : !fir.ref<i64>
+    gpu.launch_func [%asyncTok] @cuda_device_mod::@_QMmod1Psub1 blocks in (%1, %1, %1) threads in (%1, %1, %1) args() {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: %[[STREAM:.*]] = fir.alloca i64
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
+// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index ef510ff77ad25..d66fc3f08bdf8 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -44,8 +44,6 @@ int main(int argc, char **argv) {
 #endif
   DialectRegistry registry;
   fir::support::registerDialects(registry);
-  registry.insert<mlir::gpu::GPUDialect>();
-  registry.insert<mlir::NVVM::NVVMDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
       registry));

>From df9e0ca6230d80a2d18bdb53978166a878832fff Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 16 Apr 2025 15:46:16 -0700
Subject: [PATCH 2/2] Typo

---
 flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 595469398a34b..7cfdebd9bfd39 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -377,7 +377,7 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
   let description = [{
     Cast a stream object reference as a GPU async token. This is useful to be
     able to connect the stream representation of CUDA Fortran and the async
-    mechnism of the GPU dialect.
+    mechanism of the GPU dialect.
     Later in the lowering this will becoming a no op.
   }];