[llvm] [Offload] Allow allocations to be freed without knowing their type (PR #140889)
Callum Fare via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 05:53:41 PDT 2025
https://github.com/callumfare created https://github.com/llvm/llvm-project/pull/140889
In liboffload we want users to be able to free memory with `olMemFree` without knowing the allocation's type. Currently we track every allocation's associated device and type in a map, and we'd like to remove that workaround. This removes the allocation type from the tracked info.
- Add an overload of `GenericDeviceTy::dataDelete` that doesn't take a `TargetAllocTy`
- Remove the kind argument from `DeviceAllocatorTy::free`
- In the AMDGPU plugin, we can free allocations like this already. Make the relevant method static and remove the code to determine the memory pool as it never actually mattered
- In the CUDA plugin, determine what type of memory an allocation is with `cuPointerGetAttributes`
- Split out `free_non_blocking` from `free` in `DeviceAllocatorTy` so non-blocking device allocations can still be freed
From ff6cd4ac539e9cb7fe567419975a11ac446127f2 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 21 May 2025 13:11:43 +0100
Subject: [PATCH] [Offload] Allow allocations to be freed without knowing their
type
---
offload/liboffload/src/OffloadImpl.cpp | 7 +-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 32 ++------
.../common/include/MemoryManager.h | 6 +-
.../common/include/PluginInterface.h | 4 +
.../common/src/PluginInterface.cpp | 25 ++++++-
offload/plugins-nextgen/common/src/RPC.cpp | 5 +-
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 15 ++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 73 ++++++++++++++-----
offload/plugins-nextgen/host/src/rtl.cpp | 4 +-
10 files changed, 118 insertions(+), 54 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index ea65282e3ba52..e9c83c2ecbce2 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -89,7 +89,6 @@ namespace offload {
struct AllocInfo {
ol_device_handle_t Device;
- ol_alloc_type_t Type;
};
using AllocInfoMapT = DenseMap<void *, AllocInfo>;
@@ -314,7 +313,7 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
return ol_impl_result_t::fromError(Alloc.takeError());
*AllocationOut = *Alloc;
- allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device, Type});
+ allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device});
return OL_SUCCESS;
}
@@ -324,10 +323,8 @@ ol_impl_result_t olMemFree_impl(void *Address) {
auto AllocInfo = allocInfoMap().at(Address);
auto Device = AllocInfo.Device;
- auto Type = AllocInfo.Type;
- auto Res =
- Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type));
+ auto Res = Device->Device->dataDelete(Address);
if (Res)
return ol_impl_result_t::fromError(std::move(Res));
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 2733796611d9b..691918da1e8b1 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -312,7 +312,7 @@ struct AMDGPUMemoryPoolTy {
}
/// Return memory to the memory pool.
- Error deallocate(void *Ptr) {
+ static Error deallocate(void *Ptr) {
hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s");
}
@@ -444,7 +444,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
/// Deallocation callback that will be called by the memory manager.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (auto Err = MemoryPool->deallocate(TgtPtr)) {
consumeError(std::move(Err));
return OFFLOAD_FAIL;
@@ -452,6 +452,8 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// The underlying plugin that owns this memory manager.
AMDGPUPluginTy &Plugin;
@@ -2219,31 +2221,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
/// Deallocate memory on the device or related to the device.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (TgtPtr == nullptr)
return OFFLOAD_SUCCESS;
- AMDGPUMemoryPoolTy *MemoryPool = nullptr;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
- MemoryPool = CoarseGrainedMemoryPools[0];
- break;
- case TARGET_ALLOC_HOST:
- MemoryPool = &HostDevice.getFineGrainedMemoryPool();
- break;
- case TARGET_ALLOC_SHARED:
- MemoryPool = &HostDevice.getFineGrainedMemoryPool();
- break;
- }
-
- if (!MemoryPool) {
- REPORT("No memory pool for the specified allocation kind\n");
- return OFFLOAD_FAIL;
- }
-
- if (Error Err = MemoryPool->deallocate(TgtPtr)) {
+ if (Error Err = AMDGPUMemoryPoolTy::deallocate(TgtPtr)) {
REPORT("%s\n", toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
@@ -2251,6 +2233,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
AMDGPUStreamTy *Stream =
diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h
index a4f6e628c403a..3e7102d5dc8bc 100644
--- a/offload/plugins-nextgen/common/include/MemoryManager.h
+++ b/offload/plugins-nextgen/common/include/MemoryManager.h
@@ -36,7 +36,11 @@ class DeviceAllocatorTy {
TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
/// Delete the pointer \p TgtPtr on the device
- virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+ virtual int free(void *TgtPtr) = 0;
+
+ /// Delete the pointer \p TgtPtr, of type TARGET_ALLOC_DEVICE_NON_BLOCKING,
+ /// on the device.
+ virtual int free_non_blocking(void *TgtPtr) = 0;
};
/// Class of memory manager. The memory manager is per-device by using
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index fa13e26bc3483..dbd71efd868ca 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -771,6 +771,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Deallocate data from the device or involving the device.
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
+ /// Deallocate data from the device or involving the device, where the
+ /// allocation type is unknown.
+ Error dataDelete(void *TgtPtr);
+
/// Pin host memory to optimize transfers and return the device accessible
/// pointer that devices should use for memory transfers involving the host
/// pinned allocation.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9e316adad8f4..0423c97fd9645 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1479,7 +1479,7 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
[[fallthrough]];
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
- Res = free(TgtPtr, Kind);
+ Res = free(TgtPtr);
if (Res)
return Plugin::error(
ErrorCode::UNKNOWN,
@@ -1495,6 +1495,29 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
return Plugin::success();
}
+Error GenericDeviceTy::dataDelete(void *TgtPtr) {
+ // Try to free via the memory manager if it exists. If the allocation isn't
+ // managed by the memory manager it will fall back to the device allocator
+ // which should succeed.
+ int Res;
+ if (MemoryManager) {
+ Res = MemoryManager->free(TgtPtr);
+ if (Res)
+ return Plugin::error(
+ ErrorCode::UNKNOWN,
+ "Failure to deallocate device pointer %p via memory manager", TgtPtr);
+ } else {
+ Res = free(TgtPtr);
+ if (Res)
+ return Plugin::error(
+ ErrorCode::UNKNOWN,
+ "Failure to deallocate device pointer %p via device deallocator",
+ TgtPtr);
+ }
+
+ return Plugin::success();
+}
+
Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 678be78b56af6..e7e2d20a85c1f 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -35,8 +35,7 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
}
case LIBC_FREE: {
Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
- Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE_NON_BLOCKING);
+ Device.free_non_blocking(reinterpret_cast<void *>(Buffer->data[0]));
});
break;
}
@@ -198,7 +197,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
- Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+ Device.free(Buffers[Device.getDeviceId()]);
Buffers[Device.getDeviceId()] = nullptr;
Devices[Device.getDeviceId()] = nullptr;
return Error::success();
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e9bca4a24b344 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -84,6 +84,7 @@ DLWRAP(cuEventRecord, 2)
DLWRAP(cuStreamWaitEvent, 3)
DLWRAP(cuEventSynchronize, 1)
DLWRAP(cuEventDestroy, 1)
+DLWRAP(cuPointerGetAttributes, 4)
DLWRAP_FINALIZE()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c494dc3453ba 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -256,6 +256,11 @@ typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_MAX,
} CUdevice_attribute;
+typedef enum CUpointer_attribute_enum {
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,
+} CUpointer_attribute;
+
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
} CUfunction_attribute;
@@ -284,6 +289,13 @@ typedef enum CUevent_flags_enum {
CU_EVENT_INTERPROCESS = 0x4
} CUevent_flags;
+typedef enum CUmemorytype_enum {
+ CU_MEMORYTYPE_HOST = 0x01,
+ CU_MEMORYTYPE_DEVICE = 0x02,
+ CU_MEMORYTYPE_ARRAY = 0x03,
+ CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+
static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
@@ -370,5 +382,8 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
+CUresult cuPointerGetAttributes(unsigned int numAttributes,
+ CUpointer_attribute *attributes, void **data,
+ CUdeviceptr ptr);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..425b4dff21224 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -602,7 +602,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Deallocate memory on the device or related to the device.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (TgtPtr == nullptr)
return OFFLOAD_SUCCESS;
@@ -612,24 +612,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
CUresult Res;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_SHARED:
- Res = cuMemFree((CUdeviceptr)TgtPtr);
- break;
- case TARGET_ALLOC_HOST:
+
+ unsigned int IsManaged;
+ unsigned int Type;
+ void *AttributeValues[2] = {&IsManaged, &Type};
+ CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
+ cuPointerGetAttributes(2, Attributes, AttributeValues,
+ reinterpret_cast<CUdeviceptr>(TgtPtr));
+ if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
+ // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
+ // with cuMemFree
+ Res = cuMemFree(reinterpret_cast<CUdeviceptr>(TgtPtr));
+ } else {
+ // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
Res = cuMemFreeHost(TgtPtr);
- break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
- cuStreamSynchronize(Stream);
- if ((Res = cuStreamDestroy(Stream)))
- break;
- }
}
if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
@@ -639,6 +636,44 @@ struct CUDADeviceTy : public GenericDeviceTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override {
+ if (TgtPtr == nullptr)
+ return OFFLOAD_SUCCESS;
+
+ if (auto Err = setContext()) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ CUresult Res;
+ CUstream Stream;
+ Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuMemFreeAsync: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuStreamSynchronize(Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamSynchronize: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuStreamDestroy(Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamSynchronize: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+ }
+
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
@@ -1239,7 +1274,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
- if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
+ if (free(Buffer) != OFFLOAD_SUCCESS)
return Plugin::error(ErrorCode::UNKNOWN,
"failed to free memory for global buffer");
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..6d1327f984aa2 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -239,11 +239,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
/// Free the memory. Use std::free in all cases.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
std::free(TgtPtr);
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// This plugin does nothing to lock buffers. Do not return an error, just
/// return the same pointer as the device pointer.
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
More information about the llvm-commits
mailing list