[flang] [llvm] [flang][cuda][rt] Add entry point to get the allocation stream (PR #169608)
Valentin Clement バレンタイン クレメン via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 25 21:56:14 PST 2025
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/169608
None
>From ac40b32f7f138f38278ff1f32f9c9a1780de10ef Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 25 Nov 2025 21:55:23 -0800
Subject: [PATCH] [flang][cuda][rt] Add entry point to get the allocation
stream
---
flang-rt/lib/cuda/allocator.cpp | 11 +++-
.../unittests/Runtime/CUDA/Allocatable.cpp | 51 +++++++++++++++++++
flang/include/flang/Runtime/CUDA/allocator.h | 3 ++
3 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 5436051002265..d2aa832883e65 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -19,8 +19,6 @@
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"
-#include "cuda_runtime.h"
-
namespace Fortran::runtime::cuda {
struct DeviceAllocation {
@@ -133,6 +131,15 @@ void RTDEF(CUFRegisterAllocator)() {
allocatorRegistry.Register(
kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
}
+
+cudaStream_t RTDECL(CUFAssociatedGetStream)(void *p) {
+ int pos = findAllocation(p);
+ if (pos >= 0) {
+ cudaStream_t stream = deviceAllocations[pos].stream;
+ return stream;
+ }
+ return nullptr;
+}
}
void *CUFAllocPinned(
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 9935ae0eaac2f..4e65326b31a62 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -121,3 +121,54 @@ TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
cudaDeviceSynchronize();
EXPECT_EQ(cudaSuccess, cudaGetLastError());
}
+
+TEST(AllocatableAsyncTest, StreamDeviceAllocatable) {
+ using Fortran::common::TypeCategory;
+ RTNAME(CUFRegisterAllocator)();
+ // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Real, 4)};
+ a->SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+ EXPECT_FALSE(a->HasAddendum());
+ RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableAllocate)
+ (*a, /*asyncObject=*/(int64_t *)&stream, /*hasStat=*/false,
+ /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+ cudaStream_t s = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+ EXPECT_EQ(s, stream);
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+ cudaDeviceSynchronize();
+
+ cudaStream_t defaultStream = 0;
+ RTNAME(AllocatableAllocate)
+ (*a, /*asyncObject=*/(int64_t *)&defaultStream, /*hasStat=*/false,
+ /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+ cudaStream_t d = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+ EXPECT_EQ(d, defaultStream);
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+ cudaDeviceSynchronize();
+
+ RTNAME(AllocatableAllocate)
+ (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+ __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+ cudaStream_t empty = RTDECL(CUFAssociatedGetStream)(a->raw().base_addr);
+ EXPECT_EQ(empty, nullptr);
+}
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 59fdb22b6e663..4e38482a7de30 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -13,11 +13,14 @@
#include "flang/Runtime/descriptor-consts.h"
#include "flang/Runtime/entry-names.h"
+#include "cuda_runtime.h"
+
namespace Fortran::runtime::cuda {
extern "C" {
void RTDECL(CUFRegisterAllocator)();
+cudaStream_t RTDECL(CUFAssociatedGetStream)(void *);
}
void *CUFAllocPinned(std::size_t, std::int64_t *);
More information about the llvm-commits
mailing list