[flang] [llvm] [flang][cuda] Lower set/get default stream for arrays (PR #181432)

Valentin Clement バレンタイン クレメン via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 13 15:24:43 PST 2026


https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/181432

>From 5b93b1ea263971004ae5431b5d1b8de007d0b9fd Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 13 Feb 2026 13:58:48 -0800
Subject: [PATCH 1/5] [flang][cuda] Lower set/get default stream for arrays

---
 flang-rt/lib/cuda/allocator.cpp               |  6 +--
 .../unittests/Runtime/CUDA/Allocatable.cpp    |  3 +-
 .../Optimizer/Builder/CUDAIntrinsicCall.h     |  6 +++
 flang/include/flang/Runtime/CUDA/allocator.h  |  4 +-
 .../Optimizer/Builder/CUDAIntrinsicCall.cpp   | 52 +++++++++++++++++++
 flang/module/cuda_runtime_api.f90             | 39 ++++++++++++++
 flang/test/Lower/CUDA/cuda-default-stream.cuf | 24 +++++++++
 flang/tools/f18/CMakeLists.txt                |  4 +-
 8 files changed, 128 insertions(+), 10 deletions(-)
 create mode 100644 flang/module/cuda_runtime_api.f90
 create mode 100644 flang/test/Lower/CUDA/cuda-default-stream.cuf

diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 917b279b38f3c..df7e43de00c70 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -141,11 +141,9 @@ cudaStream_t RTDECL(CUFGetAssociatedStream)(void *p) {
   return nullptr;
 }
 
-int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  Terminator terminator{sourceFile, sourceLine};
+int RTDECL(CUFSetAssociatedStream)(void *p, cudaStream_t stream) {
   if (p == nullptr) {
-    return ReturnError(terminator, StatBaseNull, errMsg, hasStat);
+    return StatBaseNull;
   }
   int pos = findAllocation(p);
   if (pos >= 0) {
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index e308e8c8bdadb..0680c0086ea73 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -205,7 +205,6 @@ TEST(AllocatableAsyncTest, SetStreamTest) {
 
   // REAL(4), DEVICE, ALLOCATABLE :: b(:) - unallocated, base_addr is null
   auto b{createAllocatable(TypeCategory::Real, 4)};
-  int stat2 = RTDECL(CUFSetAssociatedStream)(
-      b->raw().base_addr, stream, true, nullptr, __FILE__, __LINE__);
+  int stat2 = RTDECL(CUFSetAssociatedStream)(b->raw().base_addr, stream);
   EXPECT_EQ(stat2, StatBaseNull);
 }
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index e9b6e5cf23933..46b6087f899ca 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -51,6 +51,12 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genClusterBlockIndex(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  fir::ExtendedValue
+      genCUDAForSetDefaultStreamArray(mlir::Type,
+                                      llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue
+      genCUDAGetDefaultStreamArg(mlir::Type,
+                                 llvm::ArrayRef<fir::ExtendedValue>);
   void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   template <const char *fctName, int extent>
   fir::ExtendedValue genLDXXFunc(mlir::Type,
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 6a64bdeccbc2c..1493383a559c0 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -21,9 +21,7 @@ extern "C" {
 
 void RTDECL(CUFRegisterAllocator)();
 cudaStream_t RTDECL(CUFGetAssociatedStream)(void *);
-int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+int RTDECL(CUFSetAssociatedStream)(void *, cudaStream_t);
 }
 
 void *CUFAllocPinned(std::size_t, std::int64_t *);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index fe2db4607f86b..63b34016bbb3b 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -19,6 +19,7 @@
 #include "flang/Optimizer/Builder/MutableBox.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Runtime/entry-names.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -382,6 +383,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genClusterDimBlocks),
      {},
      /*isElemental=*/false},
+    {"cudasetstreamarray",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDAForSetDefaultStreamArray),
+     {{{"devptr", asAddr}, {"stream", asValue}}},
+     /*isElemental=*/false},
+    {"cudastreamgetdefaultarg",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDAGetDefaultStreamArg),
+     {{{"devptr", asAddr}}},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -1103,6 +1114,47 @@ CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
   return res;
 }
 
+// CUDAFORSETSTREAMARRAY
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAForSetDefaultStreamArray(
+    mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  mlir::Value arg = fir::getBase(args[0]);
+  mlir::Value stream = fir::getBase(args[1]);
+
+  if (mlir::isa<fir::BaseBoxType>(arg.getType()))
+    arg = fir::BoxAddrOp::create(builder, loc, arg);
+  mlir::Type i64Ty = builder.getI64Type();
+  mlir::Type i32Ty = builder.getI32Type();
+  auto ctx = builder.getContext();
+  mlir::Type voidPtrTy =
+      fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+  mlir::FunctionType ftype =
+      mlir::FunctionType::get(ctx, {voidPtrTy, i64Ty}, {i32Ty});
+  mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, arg);
+  auto funcOp =
+      builder.createFunction(loc, RTNAME_STRING(CUFSetAssociatedStream), ftype);
+  auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr, stream});
+  return call.getResult(0);
+}
+
+// CUDAGETDEFAULTSTREAMARG
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAGetDefaultStreamArg(
+    mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 1);
+  mlir::Value devptr = fir::getBase(args[0]);
+  mlir::Type i64Ty = builder.getI64Type();
+  mlir::Type i32Ty = builder.getI32Type();
+  auto ctx = builder.getContext();
+  mlir::Type voidPtrTy =
+      fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
+  mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {voidPtrTy}, {i64Ty});
+  mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, devptr);
+  auto funcOp =
+      builder.createFunction(loc, RTNAME_STRING(CUFGetAssociatedStream), ftype);
+  auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr});
+  return call.getResult(0);
+}
+
 // FENCE_PROXY_ASYNC
 void CUDAIntrinsicLibrary::genFenceProxyAsync(
     llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90
new file mode 100644
index 0000000000000..b02ae5a49a0e6
--- /dev/null
+++ b/flang/module/cuda_runtime_api.f90
@@ -0,0 +1,39 @@
+!===-- module/cuda_runtime_api.f90 -----------------------------------------===!
+!
+! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+! See https://llvm.org/LICENSE.txt for license information.
+! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!
+!===------------------------------------------------------------------------===!
+
+module cuda_runtime_api
+implicit none
+
+integer, parameter :: cuda_stream_kind = int_ptr_kind()
+
+interface cudaforgetdefaultstream
+    integer(kind=cuda_stream_kind) function cudastreamgetdefaultarg(devptr)
+      import cuda_stream_kind
+      !DIR$ IGNORE_TKR (TKR) devptr
+      integer, device  :: devptr(*)
+    end function
+    integer(kind=cuda_stream_kind) function cudastreamgetdefaultnull()
+      import cuda_stream_kind
+    end function
+end interface
+
+interface cudaforsetdefaultstream
+    integer function cudasetdefaultstream(stream)
+      import cuda_stream_kind
+      !DIR$ IGNORE_TKR (K) stream
+      integer(kind=cuda_stream_kind), value :: stream
+    end function
+    integer function cudasetstreamarray(devptr, stream)
+      import cuda_stream_kind
+      !DIR$ IGNORE_TKR (K) stream, (TKR) devptr
+      integer, device  :: devptr(*)
+      integer(kind=cuda_stream_kind), value :: stream
+    end function
+end interface
+
+end module cuda_runtime_api
diff --git a/flang/test/Lower/CUDA/cuda-default-stream.cuf b/flang/test/Lower/CUDA/cuda-default-stream.cuf
new file mode 100644
index 0000000000000..59c6bc6b70612
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-default-stream.cuf
@@ -0,0 +1,24 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+subroutine associated_stream
+  use cuda_runtime_api
+  integer(kind=cuda_stream_kind) :: strm, strmout
+  integer, managed, allocatable :: v(:)
+  integer :: istat
+
+  istat = cudaforSetDefaultStream(v, strm)
+  strmout = cudaforGetDefaultStream(v)
+  
+end subroutine
+
+! CHECK-LABEL: func.func @_QPassociated_stream()
+! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[STREAM:.*]] = fir.load %{{.*}}#0 : !fir.ref<i64>
+! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[STAT:.*]] = fir.call @_FortranACUFSetAssociatedStream(%[[VOIDPTR]], %[[STREAM]]) fastmath<contract> : (!fir.llvm_ptr<i8>, i64) -> i32
+! CHECK: hlfir.assign %[[STAT]] to %{{.*}}#0 : i32, !fir.ref<i32>
+
+! CHECK: %[[ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[VOIDPTR:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK: %[[STREAM:.*]] = fir.call @_FortranACUFGetAssociatedStream(%[[VOIDPTR]]) fastmath<contract> : (!fir.llvm_ptr<i8>) -> i64
+! CHECK: hlfir.assign %[[STREAM]] to %{{.*}}#0 : i64, !fir.ref<i64>
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index ffd92f033840b..74b329f6c6c03 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -16,6 +16,7 @@ set(MODULES
   "__cuda_builtins"
   "__cuda_device"
   "cooperative_groups"
+  "cuda_runtime_api"
   "cudadevice"
   "ieee_arithmetic"
   "ieee_exceptions"
@@ -64,7 +65,8 @@ if (NOT CMAKE_CROSSCOMPILING)
       set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
     elseif(${filename} STREQUAL "__cuda_device" OR
            ${filename} STREQUAL "cudadevice" OR
-           ${filename} STREQUAL "cooperative_groups")
+           ${filename} STREQUAL "cooperative_groups" OR
+           ${filename} STREQUAL "cuda_runtime_api")
       set(opts -fc1 -xcuda)
       if(${filename} STREQUAL "__cuda_device")
         set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)

>From 932f3e29554dc5d8a83eacfefe5f5de5eecd5dd4 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 13 Feb 2026 14:26:23 -0800
Subject: [PATCH 2/5] Remove unused var

---
 flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 63b34016bbb3b..f2e14b85b28c2 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1143,7 +1143,6 @@ fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAGetDefaultStreamArg(
   assert(args.size() == 1);
   mlir::Value devptr = fir::getBase(args[0]);
   mlir::Type i64Ty = builder.getI64Type();
-  mlir::Type i32Ty = builder.getI32Type();
   auto ctx = builder.getContext();
   mlir::Type voidPtrTy =
       fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));

>From af31bd3125491130d7e66f574086eb2cf81ca1e8 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 13 Feb 2026 14:49:37 -0800
Subject: [PATCH 3/5] Fix names

---
 .../flang/Optimizer/Builder/CUDAIntrinsicCall.h    |  4 ++--
 flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp  | 14 +++++++-------
 flang/module/cuda_runtime_api.f90                  |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index 46b6087f899ca..d92f0c72dde0d 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -52,8 +52,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genClusterBlockIndex(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue
-      genCUDAForSetDefaultStreamArray(mlir::Type,
-                                      llvm::ArrayRef<fir::ExtendedValue>);
+      genCUDASetDefaultStreamArray(mlir::Type,
+                                   llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue
       genCUDAGetDefaultStreamArg(mlir::Type,
                                  llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index f2e14b85b28c2..dcbf1531664a0 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -383,16 +383,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genClusterDimBlocks),
      {},
      /*isElemental=*/false},
-    {"cudasetstreamarray",
-     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
-         &CI::genCUDAForSetDefaultStreamArray),
-     {{{"devptr", asAddr}, {"stream", asValue}}},
-     /*isElemental=*/false},
-    {"cudastreamgetdefaultarg",
+    {"cudagetstreamdefaultarg",
      static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
          &CI::genCUDAGetDefaultStreamArg),
      {{{"devptr", asAddr}}},
      /*isElemental=*/false},
+    {"cudasetstreamarray",
+     static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+         &CI::genCUDASetDefaultStreamArray),
+     {{{"devptr", asAddr}, {"stream", asValue}}},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -1115,7 +1115,7 @@ CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
 }
 
 // CUDAFORSETSTREAMARRAY
-fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAForSetDefaultStreamArray(
+fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray(
     mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 2);
   mlir::Value arg = fir::getBase(args[0]);
diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90
index b02ae5a49a0e6..0ad7ed0ebf5b5 100644
--- a/flang/module/cuda_runtime_api.f90
+++ b/flang/module/cuda_runtime_api.f90
@@ -12,7 +12,7 @@ module cuda_runtime_api
 integer, parameter :: cuda_stream_kind = int_ptr_kind()
 
 interface cudaforgetdefaultstream
-    integer(kind=cuda_stream_kind) function cudastreamgetdefaultarg(devptr)
+    integer(kind=cuda_stream_kind) function cudagetstreamdefaultarg(devptr)
       import cuda_stream_kind
       !DIR$ IGNORE_TKR (TKR) devptr
       integer, device  :: devptr(*)

>From 433f1c44165cece2c050436a70f3e081eb7e147d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 13 Feb 2026 15:17:39 -0800
Subject: [PATCH 4/5] Typo

---
 flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index dcbf1531664a0..4c4403dcd71a9 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1114,7 +1114,7 @@ CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
   return res;
 }
 
-// CUDAFORSETSTREAMARRAY
+// CUDASETSTREAMARRAY
 fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray(
     mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 2);

>From 9bfb19eebb58df96531b4af7f8238d688e090f79 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 13 Feb 2026 15:24:30 -0800
Subject: [PATCH 5/5] Format

---
 flang/module/cuda_runtime_api.f90 | 38 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/flang/module/cuda_runtime_api.f90 b/flang/module/cuda_runtime_api.f90
index 0ad7ed0ebf5b5..d6cb6d8c0f715 100644
--- a/flang/module/cuda_runtime_api.f90
+++ b/flang/module/cuda_runtime_api.f90
@@ -12,28 +12,28 @@ module cuda_runtime_api
 integer, parameter :: cuda_stream_kind = int_ptr_kind()
 
 interface cudaforgetdefaultstream
-    integer(kind=cuda_stream_kind) function cudagetstreamdefaultarg(devptr)
-      import cuda_stream_kind
-      !DIR$ IGNORE_TKR (TKR) devptr
-      integer, device  :: devptr(*)
-    end function
-    integer(kind=cuda_stream_kind) function cudastreamgetdefaultnull()
-      import cuda_stream_kind
-    end function
+  integer(kind=cuda_stream_kind) function cudagetstreamdefaultarg(devptr)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (TKR) devptr
+    integer, device  :: devptr(*)
+  end function
+  integer(kind=cuda_stream_kind) function cudastreamgetdefaultnull()
+    import cuda_stream_kind
+  end function
 end interface
 
 interface cudaforsetdefaultstream
-    integer function cudasetdefaultstream(stream)
-      import cuda_stream_kind
-      !DIR$ IGNORE_TKR (K) stream
-      integer(kind=cuda_stream_kind), value :: stream
-    end function
-    integer function cudasetstreamarray(devptr, stream)
-      import cuda_stream_kind
-      !DIR$ IGNORE_TKR (K) stream, (TKR) devptr
-      integer, device  :: devptr(*)
-      integer(kind=cuda_stream_kind), value :: stream
-    end function
+  integer function cudasetdefaultstream(stream)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (K) stream
+    integer(kind=cuda_stream_kind), value :: stream
+  end function
+  integer function cudasetstreamarray(devptr, stream)
+    import cuda_stream_kind
+    !DIR$ IGNORE_TKR (K) stream, (TKR) devptr
+    integer, device  :: devptr(*)
+    integer(kind=cuda_stream_kind), value :: stream
+  end function
 end interface
 
 end module cuda_runtime_api



More information about the llvm-commits mailing list