[flang-commits] [flang] [llvm] [flang][cuda] Add async id to allocators (PR #134724)

Mon Apr 7 13:32:46 PDT 2025

https://github.com/clementval created https://github.com/llvm/llvm-project/pull/134724

Add async id to allocators in preparation for stream allocation. 

>From 1e8cee2e00d1ef8adbfdd0c3fcd2330043d17d1d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 7 Apr 2025 13:31:22 -0700
Subject: [PATCH] [flang][cuda] Add async id to allocators

---
 .../include/flang-rt/runtime/allocator-registry.h  | 10 ++++++----
 flang-rt/lib/cuda/allocator.cpp                    | 14 +++++++++-----
 flang-rt/lib/cuda/descriptor.cpp                   |  3 ++-
 flang-rt/lib/runtime/descriptor.cpp                |  2 +-
 flang-rt/lib/runtime/pointer.cpp                   |  2 +-
 flang/include/flang/Runtime/CUDA/allocator.h       |  8 ++++----
 6 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
index 1a59ec8b1ef5b..33e8e2c7d7850 100644
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -11,6 +11,7 @@
 
 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/allocator-registry-consts.h"
+#include <cstdint>
 #include <cstdlib>
 #include <vector>
 
@@ -18,7 +19,7 @@
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
@@ -26,10 +27,11 @@ typedef struct Allocator_t {
   FreeFct free{nullptr};
 } Allocator_t;
 
-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+static RT_API_ATTRS void *MallocWrapper(
+    std::size_t size, [[maybe_unused]] std::int64_t) {
   return std::malloc(size);
 }
+#ifdef RT_DEVICE_COMPILATION
 static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
 #endif
 
@@ -39,7 +41,7 @@ struct AllocatorRegistry {
       : allocators{{&MallocWrapper, &FreeWrapper}} {}
 #else
   constexpr AllocatorRegistry() {
-    allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+    allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
   };
 #endif
   RT_API_ATTRS void Register(int, Allocator_t);
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index d6529957bc939..d606ab2d4313b 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -34,7 +34,8 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(std::size_t sizeInBytes) {
+void *CUFAllocPinned(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -42,7 +43,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes) {
+void *CUFAllocDevice(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(std::size_t sizeInBytes) {
+void *CUFAllocManaged(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(std::size_t sizeInBytes) {
+void *CUFAllocUnified(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes);
+  return CUFAllocManaged(sizeInBytes, asyncId);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index d44ab2e45d2a8..175e8c0ef8438 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
 
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
-  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+  return reinterpret_cast<Descriptor *>(
+      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
 }
 
 void RTDEF(CUFFreeDescriptor)(
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 495e25e96aded..c660d4f2ff5a0 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1)};
+  void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index 0cd46cd05e2d3..fd2427f4124b5 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
   byteSize = ((byteSize + align - 1) / align) * align;
   std::size_t total{byteSize + sizeof(std::uintptr_t)};
   AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
-  void *p{alloc(total)};
+  void *p{alloc(total, /*asyncId=*/-1)};
   if (p && allocatorIdx == 0) {
     // Fill the footer word with the XOR of the ones' complement of
     // the base address, which is a value that would be highly unlikely
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 4fb4c94c5e9b0..18ddf75ac3852 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -20,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t);
+void *CUFAllocPinned(std::size_t, std::int64_t);
 void CUFFreePinned(void *);
 
-void *CUFAllocDevice(std::size_t);
+void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t);
+void *CUFAllocManaged(std::size_t, std::int64_t);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t);
+void *CUFAllocUnified(std::size_t, std::int64_t);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda