[llvm] [Offload] Allow allocations to be freed without knowing their type (PR #140889)
Callum Fare via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 05:53:41 PDT 2025
https://github.com/callumfare created https://github.com/llvm/llvm-project/pull/140889
In liboffload we want users to be able to free memory with `olMemFree` without knowing the allocation's type. Currently we track every allocation's associated device and type in a map, and we'd like to remove that workaround. This removes the allocation type from the tracked info.
- Add an overload of `GenericDeviceTy::dataDelete` that doesn't take a `TargetAllocTy`
- Remove the kind argument from `DeviceAllocatorTy::free`
- In the AMDGPU plugin, we can free allocations like this already. Make the relevant method static and remove the code to determine the memory pool as it never actually mattered
- In the CUDA plugin, determine what type of memory an allocation is with `cuPointerGetAttributes`
- Split out `free_non_blocking` from `free` in `DeviceAllocatorTy` so non-blocking device allocations can still be freed
From ff6cd4ac539e9cb7fe567419975a11ac446127f2 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 21 May 2025 13:11:43 +0100
Subject: [PATCH] [Offload] Allow allocations to be freed without knowing their
type
---
offload/liboffload/src/OffloadImpl.cpp | 7 +-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 32 ++------
.../common/include/MemoryManager.h | 6 +-
.../common/include/PluginInterface.h | 4 +
.../common/src/PluginInterface.cpp | 25 ++++++-
offload/plugins-nextgen/common/src/RPC.cpp | 5 +-
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 15 ++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 73 ++++++++++++++-----
offload/plugins-nextgen/host/src/rtl.cpp | 4 +-
10 files changed, 118 insertions(+), 54 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index ea65282e3ba52..e9c83c2ecbce2 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -89,7 +89,6 @@ namespace offload {
struct AllocInfo {
ol_device_handle_t Device;
- ol_alloc_type_t Type;
};
using AllocInfoMapT = DenseMap<void *, AllocInfo>;
@@ -314,7 +313,7 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
return ol_impl_result_t::fromError(Alloc.takeError());
*AllocationOut = *Alloc;
- allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device, Type});
+ allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device});
return OL_SUCCESS;
}
@@ -324,10 +323,8 @@ ol_impl_result_t olMemFree_impl(void *Address) {
auto AllocInfo = allocInfoMap().at(Address);
auto Device = AllocInfo.Device;
- auto Type = AllocInfo.Type;
- auto Res =
- Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type));
+ auto Res = Device->Device->dataDelete(Address);
if (Res)
return ol_impl_result_t::fromError(std::move(Res));
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 2733796611d9b..691918da1e8b1 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -312,7 +312,7 @@ struct AMDGPUMemoryPoolTy {
}
/// Return memory to the memory pool.
- Error deallocate(void *Ptr) {
+ static Error deallocate(void *Ptr) {
hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
return Plugin::check(Status, "error in hsa_amd_memory_pool_free: %s");
}
@@ -444,7 +444,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
/// Deallocation callback that will be called by the memory manager.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (auto Err = MemoryPool->deallocate(TgtPtr)) {
consumeError(std::move(Err));
return OFFLOAD_FAIL;
@@ -452,6 +452,8 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// The underlying plugin that owns this memory manager.
AMDGPUPluginTy &Plugin;
@@ -2219,31 +2221,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
/// Deallocate memory on the device or related to the device.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (TgtPtr == nullptr)
return OFFLOAD_SUCCESS;
- AMDGPUMemoryPoolTy *MemoryPool = nullptr;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
- MemoryPool = CoarseGrainedMemoryPools[0];
- break;
- case TARGET_ALLOC_HOST:
- MemoryPool = &HostDevice.getFineGrainedMemoryPool();
- break;
- case TARGET_ALLOC_SHARED:
- MemoryPool = &HostDevice.getFineGrainedMemoryPool();
- break;
- }
-
- if (!MemoryPool) {
- REPORT("No memory pool for the specified allocation kind\n");
- return OFFLOAD_FAIL;
- }
-
- if (Error Err = MemoryPool->deallocate(TgtPtr)) {
+ if (Error Err = AMDGPUMemoryPoolTy::deallocate(TgtPtr)) {
REPORT("%s\n", toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
@@ -2251,6 +2233,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
AMDGPUStreamTy *Stream =
diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h
index a4f6e628c403a..3e7102d5dc8bc 100644
--- a/offload/plugins-nextgen/common/include/MemoryManager.h
+++ b/offload/plugins-nextgen/common/include/MemoryManager.h
@@ -36,7 +36,11 @@ class DeviceAllocatorTy {
TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
/// Delete the pointer \p TgtPtr on the device
- virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+ virtual int free(void *TgtPtr) = 0;
+
+ /// Delete the pointer \p TgtPtr, of type TARGET_ALLOC_DEVICE_NON_BLOCKING,
+ /// on the device.
+ virtual int free_non_blocking(void *TgtPtr) = 0;
};
/// Class of memory manager. The memory manager is per-device by using
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index fa13e26bc3483..dbd71efd868ca 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -771,6 +771,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Deallocate data from the device or involving the device.
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
+ /// Deallocate data from the device or involving the device, where the
+ /// allocation type is unknown.
+ Error dataDelete(void *TgtPtr);
+
/// Pin host memory to optimize transfers and return the device accessible
/// pointer that devices should use for memory transfers involving the host
/// pinned allocation.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9e316adad8f4..0423c97fd9645 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1479,7 +1479,7 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
[[fallthrough]];
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
- Res = free(TgtPtr, Kind);
+ Res = free(TgtPtr);
if (Res)
return Plugin::error(
ErrorCode::UNKNOWN,
@@ -1495,6 +1495,29 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
return Plugin::success();
}
+Error GenericDeviceTy::dataDelete(void *TgtPtr) {
+ // Try to free via the memory manager if it exists. If the allocation isn't
+ // managed by the memory manager it will fall back to the device allocator
+ // which should succeed.
+ int Res;
+ if (MemoryManager) {
+ Res = MemoryManager->free(TgtPtr);
+ if (Res)
+ return Plugin::error(
+ ErrorCode::UNKNOWN,
+ "Failure to deallocate device pointer %p via memory manager", TgtPtr);
+ } else {
+ Res = free(TgtPtr);
+ if (Res)
+ return Plugin::error(
+ ErrorCode::UNKNOWN,
+ "Failure to deallocate device pointer %p via device deallocator",
+ TgtPtr);
+ }
+
+ return Plugin::success();
+}
+
Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 678be78b56af6..e7e2d20a85c1f 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -35,8 +35,7 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
}
case LIBC_FREE: {
Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
- Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE_NON_BLOCKING);
+ Device.free_non_blocking(reinterpret_cast<void *>(Buffer->data[0]));
});
break;
}
@@ -198,7 +197,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
- Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+ Device.free(Buffers[Device.getDeviceId()]);
Buffers[Device.getDeviceId()] = nullptr;
Devices[Device.getDeviceId()] = nullptr;
return Error::success();
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e9bca4a24b344 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -84,6 +84,7 @@ DLWRAP(cuEventRecord, 2)
DLWRAP(cuStreamWaitEvent, 3)
DLWRAP(cuEventSynchronize, 1)
DLWRAP(cuEventDestroy, 1)
+DLWRAP(cuPointerGetAttributes, 4)
DLWRAP_FINALIZE()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c494dc3453ba 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -256,6 +256,11 @@ typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_MAX,
} CUdevice_attribute;
+typedef enum CUpointer_attribute_enum {
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,
+} CUpointer_attribute;
+
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
} CUfunction_attribute;
@@ -284,6 +289,13 @@ typedef enum CUevent_flags_enum {
CU_EVENT_INTERPROCESS = 0x4
} CUevent_flags;
+typedef enum CUmemorytype_enum {
+ CU_MEMORYTYPE_HOST = 0x01,
+ CU_MEMORYTYPE_DEVICE = 0x02,
+ CU_MEMORYTYPE_ARRAY = 0x03,
+ CU_MEMORYTYPE_UNIFIED = 0x04
+} CUmemorytype;
+
static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
@@ -370,5 +382,8 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
+CUresult cuPointerGetAttributes(unsigned int numAttributes,
+ CUpointer_attribute *attributes, void **data,
+ CUdeviceptr ptr);
#endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..425b4dff21224 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -602,7 +602,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Deallocate memory on the device or related to the device.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
if (TgtPtr == nullptr)
return OFFLOAD_SUCCESS;
@@ -612,24 +612,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
CUresult Res;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_SHARED:
- Res = cuMemFree((CUdeviceptr)TgtPtr);
- break;
- case TARGET_ALLOC_HOST:
+
+ unsigned int IsManaged;
+ unsigned int Type;
+ void *AttributeValues[2] = {&IsManaged, &Type};
+ CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
+ cuPointerGetAttributes(2, Attributes, AttributeValues,
+ reinterpret_cast<CUdeviceptr>(TgtPtr));
+ if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
+ // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
+ // with cuMemFree
+ Res = cuMemFree(reinterpret_cast<CUdeviceptr>(TgtPtr));
+ } else {
+ // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
Res = cuMemFreeHost(TgtPtr);
- break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
- cuStreamSynchronize(Stream);
- if ((Res = cuStreamDestroy(Stream)))
- break;
- }
}
if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
@@ -639,6 +636,44 @@ struct CUDADeviceTy : public GenericDeviceTy {
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override {
+ if (TgtPtr == nullptr)
+ return OFFLOAD_SUCCESS;
+
+ if (auto Err = setContext()) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ CUresult Res;
+ CUstream Stream;
+ Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuMemFreeAsync: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuStreamSynchronize(Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamSynchronize: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ Res = cuStreamDestroy(Stream);
+ if (auto Err = Plugin::check(Res, "Error in cuStreamSynchronize: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+ }
+
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
@@ -1239,7 +1274,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
- if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
+ if (free(Buffer) != OFFLOAD_SUCCESS)
return Plugin::error(ErrorCode::UNKNOWN,
"failed to free memory for global buffer");
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..6d1327f984aa2 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -239,11 +239,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
/// Free the memory. Use std::free in all cases.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
+ int free(void *TgtPtr) override {
std::free(TgtPtr);
return OFFLOAD_SUCCESS;
}
+ int free_non_blocking(void *TgtPtr) override { return free(TgtPtr); }
+
/// This plugin does nothing to lock buffers. Do not return an error, just
/// return the same pointer as the device pointer.
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
More information about the llvm-commits
mailing list