[flang-commits] [flang] [llvm] [flang][cuda] Add support for cudaStreamDestroy (PR #183648)

Thu Feb 26 15:56:47 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

Add specific lowering and entry point for cudaStreamDestroy. Since we keep associated stream for some allocation, we need to reset it when the stream is destroy so we don't use it anymore. 

---
Full diff: https://github.com/llvm/llvm-project/pull/183648.diff


9 Files Affected:

- (modified) flang-rt/lib/cuda/allocator.cpp (+9) 
- (modified) flang-rt/lib/cuda/stream.cpp (+6) 
- (modified) flang-rt/unittests/Runtime/CUDA/Allocatable.cpp (+35) 
- (modified) flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h (+2) 
- (modified) flang/include/flang/Runtime/CUDA/allocator.h (+2) 
- (modified) flang/include/flang/Runtime/CUDA/stream.h (+1) 
- (modified) flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp (+19) 
- (modified) flang/module/cuda_runtime_api.f90 (+8) 
- (modified) flang/test/Lower/CUDA/cuda-default-stream.cuf (+10) 


``````````diff

diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index df7e43de00c70..b66033cb8668e 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -119,6 +119,15 @@ static void eraseAllocation(int pos) {
   --numDeviceAllocations;
 }
 
+void CUFResetStream(cudaStream_t stream) {
+  CriticalSection critical{lock};
+  for (int i = 0; i < numDeviceAllocations; ++i) {
+    if (deviceAllocations[i].stream == stream) {
+      deviceAllocations[i].stream = nullptr;
+    }
+  }
+}
+
 extern "C" {
 
 void RTDEF(CUFRegisterAllocator)() {
diff --git a/flang-rt/lib/cuda/stream.cpp b/flang-rt/lib/cuda/stream.cpp
index 20cf49989e2ed..2ef51b205082b 100644
--- a/flang-rt/lib/cuda/stream.cpp
+++ b/flang-rt/lib/cuda/stream.cpp
@@ -14,6 +14,7 @@
 #include "flang-rt/runtime/lock.h"
 #include "flang-rt/runtime/stat.h"
 #include "flang-rt/runtime/terminator.h"
+#include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Support/Fortran.h"
 
@@ -37,6 +38,11 @@ int RTDECL(CUFStreamSynchronize)(cudaStream_t stream) {
 int RTDECL(CUFStreamSynchronizeNull)() {
   return cudaStreamSynchronize(RTNAME(CUFGetDefaultStream)());
 }
+
+int RTDECL(CUFStreamDestroy)(cudaStream_t stream) {
+  CUFResetStream(stream);
+  return cudaStreamDestroy(stream);
+}
 }
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 1e98acfd51516..9ca1bac3ec8f1 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -209,3 +209,38 @@ TEST(AllocatableAsyncTest, SetStreamTest) {
   int stat2 = RTDECL(CUFSetAssociatedStream)(b->raw().base_addr, stream);
   EXPECT_EQ(stat2, StatBaseNull);
 }
+
+TEST(AllocatableAsyncTest, DestroyStreamTest) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  RTNAME(AllocatableAllocate)
+  (*a, /*asyncObject=*/(std::int64_t *)&stream, /*hasStat=*/false,
+      /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+  cudaStream_t s = RTNAME(CUFGetAssociatedStream)(a->raw().base_addr);
+  EXPECT_EQ(s, stream);
+
+  RTNAME(CUFStreamDestroy)(stream);
+  s = RTNAME(CUFGetAssociatedStream)(a->raw().base_addr);
+  EXPECT_EQ(s, nullptr);
+
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+  cudaDeviceSynchronize();
+  EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index 5e8fbcd1b93ac..6167a876f7b62 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -65,6 +65,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
       genCUDAStreamSynchronize(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genCUDAStreamSynchronizeNull(mlir::Type,
                                            llvm::ArrayRef<mlir::Value>);
+  fir::ExtendedValue genCUDAStreamDestroy(mlir::Type,
+                                          llvm::ArrayRef<fir::ExtendedValue>);
   void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   template <const char *fctName, int extent>
   fir::ExtendedValue genLDXXFunc(mlir::Type,
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 698b979636da4..18907252b575d 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -23,6 +23,8 @@ int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t);
 void RTDECL(CUFRegisterAllocator)();
 }
 
+void CUFResetStream(cudaStream_t stream);
+
 void *CUFAllocPinned(std::size_t, std::int64_t *);
 void CUFFreePinned(void *);
 
diff --git a/flang/include/flang/Runtime/CUDA/stream.h b/flang/include/flang/Runtime/CUDA/stream.h
index 686b828d4e145..7431ac6a27e24 100644
--- a/flang/include/flang/Runtime/CUDA/stream.h
+++ b/flang/include/flang/Runtime/CUDA/stream.h
@@ -23,6 +23,7 @@ int RTDECL(CUFSetDefaultStream)(cudaStream_t);
 cudaStream_t RTDECL(CUFGetDefaultStream)();
 int RTDECL(CUFStreamSynchronize)(cudaStream_t);
 int RTDECL(CUFStreamSynchronizeNull)();
+int RTDECL(CUFStreamDestroy)(cudaStream_t);
 }
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 0403a43a845cd..6d8c2fcf42bdd 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -403,6 +403,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genCUDASetDefaultStream),
      {{{"stream", asValue}}},
      /*isElemental=*/false},
+    {"cudastreamdestroy",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDAStreamDestroy),
+     {{{"stream", asValue}}},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -1161,6 +1166,20 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray(
   return call.getResult(0);
 }
 
+// CUDASTREAMDESTROY
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamDestroy(
+    mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 1);
+  mlir::Value stream = fir::getBase(args[0]);
+  mlir::Type i64Ty = builder.getI64Type();
+  auto ctx = builder.getContext();
+  mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
+  auto funcOp =
+      builder.createFunction(loc, RTNAME_STRING(CUFStreamDestroy), ftype);
+  auto call = fir::CallOp::create(builder, loc, funcOp, {stream});
+  return call.getResult(0);
+}
+
 // CUDASTREAMSYNCHRONIZE
 fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamSynchronize(
     mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90
index 7c6968cabc373..1e95bcc2f81c9 100644
--- a/flang/module/cuda_runtime_api.f90
+++ b/flang/module/cuda_runtime_api.f90
@@ -36,4 +36,12 @@ integer function cudasetstreamarray(devptr, stream)
   end function
 end interface
 
+interface cudastreamdestroy
+  integer function cudastreamdestroy(stream)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (K) stream
+    integer(kind=cuda_stream_kind), value :: stream
+  end function
+end interface
+
 end module cuda_runtime_api
diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf
index b9fe1f7949595..5fc7de68b47d4 100644
--- a/flang/test/Lower/CUDA/cuda-default-stream.cuf
+++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf
@@ -39,3 +39,13 @@ end subroutine
 ! CHECK: %{{.*}} = fir.call @_FortranACUFGetDefaultStream() fastmath<contract> : () -> i64
 ! CHECK: %{{.*}} = fir.call @_FortranACUFGetDefaultStream() fastmath<contract> : () -> i64
 
+subroutine stream_destroy
+  use cuda_runtime_api
+  integer(kind=cuda_stream_kind) :: strm
+  integer :: istat
+  istat = cudaStreamCreate(strm)
+  istat = cudaStreamDestroy(strm)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPstream_destroy()
+! CHECK: %{{.*}} = fir.call @_FortranACUFStreamDestroy(%{{.*}}) fastmath<contract> : (i64) -> i32

``````````

</details>


https://github.com/llvm/llvm-project/pull/183648