[llvm] [flang][cuda][rt] Track asynchronous allocation stream for deallocation (PR #137073)
Valentin Clement バレンタイン クレメン via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 23 15:11:16 PDT 2025
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/137073
When an asynchronous allocation is made, we call `cudaMallocAsync` with a stream. For deallocation, we need to call `cudaFreeAsync` with the same stream. in order to achieve that, we need to track the allocation and their respective stream.
This patch adds a simple sorted array of asynchronous allocations. A binary search is performed to retrieve the allocation when deallocation is needed.
>From 0f8a06c6f13a192bf4e5b566e4121ba3b4f05d0e Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Wed, 16 Apr 2025 13:53:22 -0700
Subject: [PATCH] [flang][cuda][rt] Track asynchronous allocation stream for
deallocation
---
flang-rt/lib/cuda/allocator.cpp | 114 +++++++++++++++++-
.../unittests/Runtime/CUDA/Allocatable.cpp | 59 +++++++++
2 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index a1c3a2c1b2ea8..6b85aa0efbb81 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -11,6 +11,7 @@
#include "flang-rt/runtime/derived.h"
#include "flang-rt/runtime/descriptor.h"
#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/lock.h"
#include "flang-rt/runtime/stat.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/type-info.h"
@@ -21,6 +22,106 @@
#include "cuda_runtime.h"
namespace Fortran::runtime::cuda {
+
+struct DeviceAllocation {
+ void *ptr;
+ std::size_t size;
+ cudaStream_t stream;
+};
+
+// Compare address values. nullptr will be sorted at the end of the array.
+int compareDeviceAlloc(const void *a, const void *b) {
+ const DeviceAllocation *deva = (const DeviceAllocation *)a;
+ const DeviceAllocation *devb = (const DeviceAllocation *)b;
+ if (deva->ptr == nullptr && devb->ptr == nullptr)
+ return 0;
+ if (deva->ptr == nullptr)
+ return 1;
+ if (devb->ptr == nullptr)
+ return -1;
+ return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
+}
+
+// Dynamic array for tracking asynchronous allocations.
+static DeviceAllocation *deviceAllocations = nullptr;
+Lock lock;
+static int maxDeviceAllocations{512}; // Initial size
+static int numDeviceAllocations{0};
+static constexpr int allocNotFound{-1};
+
+static void initAllocations() {
+ if (!deviceAllocations) {
+ deviceAllocations = static_cast<DeviceAllocation *>(
+ malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
+ if (!deviceAllocations) {
+ Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Failed to allocate tracking array");
+ }
+ }
+}
+
+// Double the size of the allocation array when size if
+static void doubleAllocationArray() {
+ unsigned newSize = maxDeviceAllocations * 2;
+ DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
+ realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
+ if (!newArray) {
+ Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Failed to reallocate tracking array");
+ }
+ deviceAllocations = newArray;
+ maxDeviceAllocations = newSize;
+}
+
+static unsigned findAllocation(void *ptr) {
+ if (numDeviceAllocations == 0) {
+ return allocNotFound;
+ }
+
+ int left{0};
+ int right{numDeviceAllocations - 1};
+
+ if (left == right) {
+ return left;
+ }
+
+ while (left <= right) {
+ int mid = left + (right - left) / 2;
+ if (deviceAllocations[mid].ptr == ptr) {
+ return mid;
+ }
+ if (deviceAllocations[mid].ptr < ptr) {
+ left = mid + 1;
+ } else {
+ right = mid - 1;
+ }
+ }
+ return allocNotFound;
+}
+
+static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
+ CriticalSection critical{lock};
+ initAllocations();
+ if (numDeviceAllocations >= maxDeviceAllocations) {
+ doubleAllocationArray();
+ }
+ deviceAllocations[numDeviceAllocations].ptr = ptr;
+ deviceAllocations[numDeviceAllocations].size = size;
+ deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
+ ++numDeviceAllocations;
+ qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
+ compareDeviceAlloc);
+}
+
+static void eraseAllocation(int pos) {
+ deviceAllocations[pos].ptr = nullptr;
+ deviceAllocations[pos].size = 0;
+ deviceAllocations[pos].stream = (cudaStream_t)0;
+ qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
+ compareDeviceAlloc);
+ --numDeviceAllocations;
+}
+
extern "C" {
void RTDEF(CUFRegisterAllocator)() {
@@ -55,12 +156,23 @@ void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
} else {
CUDA_REPORT_IF_ERROR(
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
+ insertAllocation(p, sizeInBytes, asyncId);
}
}
return p;
}
-void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
+void CUFFreeDevice(void *p) {
+ CriticalSection critical{lock};
+ int pos = findAllocation(p);
+ if (pos >= 0) {
+ cudaStream_t stream = deviceAllocations[pos].stream;
+ eraseAllocation(pos);
+ CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
+ } else {
+ CUDA_REPORT_IF_ERROR(cudaFree(p));
+ }
+}
void *CUFAllocManaged(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
diff --git a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index 1c8ded0f87d4e..89649aa95ad93 100644
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -58,3 +58,62 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
EXPECT_EQ(cudaSuccess, cudaGetLastError());
}
+
+TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
+ using Fortran::common::TypeCategory;
+ RTNAME(CUFRegisterAllocator)();
+ // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Real, 4)};
+ a->SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+ EXPECT_FALSE(a->HasAddendum());
+ RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+
+ auto b{createAllocatable(TypeCategory::Real, 4)};
+ b->SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, b->GetAllocIdx());
+ EXPECT_FALSE(b->HasAddendum());
+ RTNAME(AllocatableSetBounds)(*b, 0, 1, 20);
+
+ auto c{createAllocatable(TypeCategory::Real, 4)};
+ c->SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, c->GetAllocIdx());
+ EXPECT_FALSE(b->HasAddendum());
+ RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);
+
+ RTNAME(AllocatableAllocate)
+ (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableAllocate)
+ (*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(b->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableAllocate)
+ (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(c->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableDeallocate)
+ (*b, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(b->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+
+ RTNAME(AllocatableDeallocate)
+ (*c, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(c->IsAllocated());
+ cudaDeviceSynchronize();
+ EXPECT_EQ(cudaSuccess, cudaGetLastError());
+}
More information about the llvm-commits
mailing list