[flang-commits] [flang] [flang][cuda] Use async id for device stream allocation (PR #118733)
via flang-commits
flang-commits at lists.llvm.org
Wed Dec 4 18:29:22 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-runtime
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
When stream is specified use cudaMallocAsync with the specified stream
---
Full diff: https://github.com/llvm/llvm-project/pull/118733.diff
3 Files Affected:
- (modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-3)
- (modified) flang/runtime/CUDA/allocator.cpp (+11-10)
- (modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+18)
``````````diff
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 40423c5ce04885..618da44c675d85 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -9,6 +9,7 @@
#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
#define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
+#include "common.h"
#include "flang/Runtime/descriptor.h"
#include "flang/Runtime/entry-names.h"
@@ -19,16 +20,16 @@ extern "C" {
void RTDECL(CUFRegisterAllocator)();
}
-void *CUFAllocPinned(std::size_t, std::int64_t);
+void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
void CUFFreePinned(void *);
void *CUFAllocDevice(std::size_t, std::int64_t);
void CUFFreeDevice(void *);
-void *CUFAllocManaged(std::size_t, std::int64_t);
+void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
void CUFFreeManaged(void *);
-void *CUFAllocUnified(std::size_t, std::int64_t);
+void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
void CUFFreeUnified(void *);
} // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index e41ed77e40ff99..d848f1811dcf3f 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,8 +33,7 @@ void RTDEF(CUFRegisterAllocator)() {
}
}
-void *CUFAllocPinned(
- std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
return p;
@@ -42,17 +41,20 @@ void *CUFAllocPinned(
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
-void *CUFAllocDevice(
- std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
void *p;
- CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+ if (stream >= 0) {
+ CUDA_REPORT_IF_ERROR(
+ cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
+ } else {
+ CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+ }
return p;
}
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocManaged(
- std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
void *p;
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -61,10 +63,9 @@ void *CUFAllocManaged(
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocUnified(
- std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
// Call alloc managed for the time being.
- return CUFAllocManaged(sizeInBytes, asyncId);
+ return CUFAllocManaged(sizeInBytes);
}
void CUFFreeUnified(void *p) {
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 435172890472da..848093939dc57f 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -43,6 +43,24 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
EXPECT_FALSE(a->IsAllocated());
}
+TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
+ using Fortran::common::TypeCategory;
+ RTNAME(CUFRegisterAllocator)();
+ // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Real, 4)};
+ a->SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+ EXPECT_FALSE(a->HasAddendum());
+ RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+ RTNAME(AllocatableAllocate)
+ (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+ __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+}
+
TEST(AllocatableCUFTest, SimplePinnedAllocate) {
using Fortran::common::TypeCategory;
RTNAME(CUFRegisterAllocator)();
``````````
</details>
https://github.com/llvm/llvm-project/pull/118733
More information about the flang-commits
mailing list