[flang-commits] [flang] [flang][cuda] Use async id for device stream allocation (PR #118733)

Wed Dec 4 18:29:22 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

When stream is specified use cudaMallocAsync with the specified stream

---
Full diff: https://github.com/llvm/llvm-project/pull/118733.diff


3 Files Affected:

- (modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-3) 
- (modified) flang/runtime/CUDA/allocator.cpp (+11-10) 
- (modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+18) 


``````````diff

diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 40423c5ce04885..618da44c675d85 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 
+#include "common.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
@@ -19,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t, std::int64_t);
+void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreePinned(void *);
 
 void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t, std::int64_t);
+void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t, std::int64_t);
+void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index e41ed77e40ff99..d848f1811dcf3f 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,8 +33,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -42,17 +41,20 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
   void *p;
-  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  if (stream >= 0) {
+    CUDA_REPORT_IF_ERROR(
+        cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
+  } else {
+    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  }
   return p;
 }
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -61,10 +63,9 @@ void *CUFAllocManaged(
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 435172890472da..848093939dc57f 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -43,6 +43,24 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   EXPECT_FALSE(a->IsAllocated());
 }
 
+TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+}
+
 TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();

``````````

</details>


https://github.com/llvm/llvm-project/pull/118733