[flang-commits] [flang] [llvm] [flang][cuda] Add async id to allocators (PR #134724)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Mon Apr 7 13:32:46 PDT 2025
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/134724
Add async id to allocators in preparation for stream allocation.
>From 1e8cee2e00d1ef8adbfdd0c3fcd2330043d17d1d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 7 Apr 2025 13:31:22 -0700
Subject: [PATCH] [flang][cuda] Add async id to allocators
---
.../include/flang-rt/runtime/allocator-registry.h | 10 ++++++----
flang-rt/lib/cuda/allocator.cpp | 14 +++++++++-----
flang-rt/lib/cuda/descriptor.cpp | 3 ++-
flang-rt/lib/runtime/descriptor.cpp | 2 +-
flang-rt/lib/runtime/pointer.cpp | 2 +-
flang/include/flang/Runtime/CUDA/allocator.h | 8 ++++----
6 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
index 1a59ec8b1ef5b..33e8e2c7d7850 100644
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -11,6 +11,7 @@
#include "flang/Common/api-attrs.h"
#include "flang/Runtime/allocator-registry-consts.h"
+#include <cstdint>
#include <cstdlib>
#include <vector>
@@ -18,7 +19,7 @@
namespace Fortran::runtime {
-using AllocFct = void *(*)(std::size_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t);
using FreeFct = void (*)(void *);
typedef struct Allocator_t {
@@ -26,10 +27,11 @@ typedef struct Allocator_t {
FreeFct free{nullptr};
} Allocator_t;
-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+static RT_API_ATTRS void *MallocWrapper(
+ std::size_t size, [[maybe_unused]] std::int64_t) {
return std::malloc(size);
}
+#ifdef RT_DEVICE_COMPILATION
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
#endif
@@ -39,7 +41,7 @@ struct AllocatorRegistry {
: allocators{{&MallocWrapper, &FreeWrapper}} {}
#else
constexpr AllocatorRegistry() {
- allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+ allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
};
#endif
RT_API_ATTRS void Register(int, Allocator_t);
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index d6529957bc939..d606ab2d4313b 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -34,7 +34,8 @@ void RTDEF(CUFRegisterAllocator)() {
}
}
-void *CUFAllocPinned(std::size_t sizeInBytes) {
+void *CUFAllocPinned(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
return p;
@@ -42,7 +43,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
-void *CUFAllocDevice(std::size_t sizeInBytes) {
+void *CUFAllocDevice(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
CUDA_REPORT_IF_ERROR(
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocManaged(std::size_t sizeInBytes) {
+void *CUFAllocManaged(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocUnified(std::size_t sizeInBytes) {
+void *CUFAllocUnified(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
// Call alloc managed for the time being.
- return CUFAllocManaged(sizeInBytes);
+ return CUFAllocManaged(sizeInBytes, asyncId);
}
void CUFFreeUnified(void *p) {
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index d44ab2e45d2a8..175e8c0ef8438 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
Descriptor *RTDEF(CUFAllocDescriptor)(
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
- return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+ return reinterpret_cast<Descriptor *>(
+ CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
}
void RTDEF(CUFFreeDescriptor)(
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 495e25e96aded..c660d4f2ff5a0 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
// Zero size allocation is possible in Fortran and the resulting
// descriptor must be allocated/associated. Since std::malloc(0)
// result is implementation defined, always allocate at least one byte.
- void *p{alloc(byteSize ? byteSize : 1)};
+ void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
if (!p) {
return CFI_ERROR_MEM_ALLOCATION;
}
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index 0cd46cd05e2d3..fd2427f4124b5 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
byteSize = ((byteSize + align - 1) / align) * align;
std::size_t total{byteSize + sizeof(std::uintptr_t)};
AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
- void *p{alloc(total)};
+ void *p{alloc(total, /*asyncId=*/-1)};
if (p && allocatorIdx == 0) {
// Fill the footer word with the XOR of the ones' complement of
// the base address, which is a value that would be highly unlikely
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 4fb4c94c5e9b0..18ddf75ac3852 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -20,16 +20,16 @@ extern "C" {
void RTDECL(CUFRegisterAllocator)();
}
-void *CUFAllocPinned(std::size_t);
+void *CUFAllocPinned(std::size_t, std::int64_t);
void CUFFreePinned(void *);
-void *CUFAllocDevice(std::size_t);
+void *CUFAllocDevice(std::size_t, std::int64_t);
void CUFFreeDevice(void *);
-void *CUFAllocManaged(std::size_t);
+void *CUFAllocManaged(std::size_t, std::int64_t);
void CUFFreeManaged(void *);
-void *CUFAllocUnified(std::size_t);
+void *CUFAllocUnified(std::size_t, std::int64_t);
void CUFFreeUnified(void *);
} // namespace Fortran::runtime::cuda
More information about the flang-commits
mailing list