[flang-commits] [flang] [flang][cuda] Widen stream argument to i64 in stream intrinsic lowering (PR #196650)

Fri May 8 14:52:47 PDT 2026

https://github.com/wangzpgi created https://github.com/llvm/llvm-project/pull/196650

`genCUDASetDefaultStream` and `genCUDAStreamDestroy` build their runtime call with an `i64` stream parameter but pass the actual argument straight through without converting it. As a result, a smaller-kind actual (e.g. the default-kind literal `0` in `cudaforSetDefaultStream(0)`) produces an ill-typed `fir.call`, which later surfaces as:
```
error: 'llvm.call' op operand type mismatch for operand 0: 'i32' != 'i64'
```
Insert a `fir.convert` to `i64` before the call, matching what `genCUDASetDefaultStreamArray` already does.

>From dc8c23c9f476a45a47eaf0ee626bb27971627730 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Fri, 8 May 2026 14:50:17 -0700
Subject: [PATCH] Widen stream argument to i64 in stream intrinsic lowering

---
 .../Optimizer/Builder/CUDAIntrinsicCall.cpp   |  3 ++
 flang/test/Lower/CUDA/cuda-default-stream.cuf | 34 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index b53294b68ac92..bc95d7d2893a7 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1135,6 +1135,8 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStream(
   assert(args.size() == 1);
   mlir::Value stream = fir::getBase(args[0]);
   mlir::Type i64Ty = builder.getI64Type();
+  // Widen to i64 to accept smaller integer-kind actuals (e.g. literal 0).
+  stream = builder.createConvert(loc, i64Ty, stream);
   auto ctx = builder.getContext();
   mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
   auto funcOp =
@@ -1172,6 +1174,7 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamDestroy(
   assert(args.size() == 1);
   mlir::Value stream = fir::getBase(args[0]);
   mlir::Type i64Ty = builder.getI64Type();
+  stream = builder.createConvert(loc, i64Ty, stream);
   auto ctx = builder.getContext();
   mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
   auto funcOp =
diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf
index 5fc7de68b47d4..af09604865431 100644
--- a/flang/test/Lower/CUDA/cuda-default-stream.cuf
+++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf
@@ -49,3 +49,37 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPstream_destroy()
 ! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%{{.*}}) fastmath<contract> : (i64) -> i32
+
+! A default-kind (i32) actual argument must be widened to i64 before
+! reaching the runtime stream call.
+subroutine default_stream_i32_literal
+  use cuda_runtime_api
+  integer :: istat
+  istat = cudaforSetDefaultStream(0)
+  istat = cudaStreamDestroy(0)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdefault_stream_i32_literal()
+! CHECK: %[[ZERO1:.*]] = arith.constant 0 : i32
+! CHECK: %[[STRM1:.*]] = fir.convert %[[ZERO1]] : (i32) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranACUFSetDefaultStream(%[[STRM1]]) fastmath<contract> : (i64) -> i32
+! CHECK: %[[ZERO2:.*]] = arith.constant 0 : i32
+! CHECK: %[[STRM2:.*]] = fir.convert %[[ZERO2]] : (i32) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%[[STRM2]]) fastmath<contract> : (i64) -> i32
+
+subroutine default_stream_i32_var
+  use cuda_runtime_api
+  integer :: istat
+  integer(4) :: s
+  s = 0
+  istat = cudaforSetDefaultStream(s)
+  istat = cudaStreamDestroy(s)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdefault_stream_i32_var()
+! CHECK: %[[L1:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[V1:.*]] = fir.convert %[[L1]] : (i32) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranACUFSetDefaultStream(%[[V1]]) fastmath<contract> : (i64) -> i32
+! CHECK: %[[L2:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[V2:.*]] = fir.convert %[[L2]] : (i32) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%[[V2]]) fastmath<contract> : (i64) -> i32