[llvm-branch-commits] [llvm] [Offload] Add GenericPluginTy::get_mem_info (PR #157484)
Ross Brunton via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Sep 9 03:12:15 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/157484
>From 7bf7fe1df8a873964df2ebc17328d9bef00f1347 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Mon, 8 Sep 2025 10:45:42 +0100
Subject: [PATCH 1/2] [Offload] Add GenericPluginTy::get_mem_info
This takes a pointer allocated by the plugin, and returns a struct
containing important information about it. This is now used in
`olMemFree` instead of using a map to track allocation info.
---
offload/include/omptarget.h | 2 +
offload/liboffload/src/OffloadImpl.cpp | 27 +--
.../amdgpu/dynamic_hsa/hsa.cpp | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 3 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 31 ++-
.../common/include/PluginInterface.h | 13 ++
offload/plugins-nextgen/cuda/src/rtl.cpp | 216 +++++++++++-------
offload/plugins-nextgen/host/src/rtl.cpp | 5 +
8 files changed, 193 insertions(+), 105 deletions(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 8fd722bb15022..197cbd3806d91 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -96,6 +96,8 @@ enum OpenMPOffloadingDeclareTargetFlags {
OMP_REGISTER_REQUIRES = 0x10,
};
+// Note: Values of this enum must fit in 3 bits, as the amdgpu plugin stores
+// the allocation kind in the lower 3 bits of a pointer
enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index fef3a5669e0d5..9620c35ac5c10 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -201,8 +201,6 @@ struct OffloadContext {
bool TracingEnabled = false;
bool ValidationEnabled = true;
- DenseMap<void *, AllocInfo> AllocInfoMap{};
- std::mutex AllocInfoMapMutex{};
SmallVector<ol_platform_impl_t, 4> Platforms{};
size_t RefCount;
@@ -624,32 +622,15 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
return Alloc.takeError();
*AllocationOut = *Alloc;
- {
- std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
- OffloadContext::get().AllocInfoMap.insert_or_assign(
- *Alloc, AllocInfo{Device, Type});
- }
return Error::success();
}
Error olMemFree_impl(ol_platform_handle_t Platform, void *Address) {
- ol_device_handle_t Device;
- ol_alloc_type_t Type;
- {
- std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
- if (!OffloadContext::get().AllocInfoMap.contains(Address))
- return createOffloadError(ErrorCode::INVALID_ARGUMENT,
- "address is not a known allocation");
-
- auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
- Device = AllocInfo.Device;
- Type = AllocInfo.Type;
- OffloadContext::get().AllocInfoMap.erase(Address);
- }
- assert(Platform == Device->Platform);
+ auto MemInfo = Platform->Plugin->get_memory_info(Address);
+ if (auto Err = MemInfo.takeError())
+ return Err;
- if (auto Res =
- Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)))
+ if (auto Res = MemInfo->Device->dataDelete(Address, MemInfo->Type))
return Res;
return Error::success();
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index bc92f4a46a5c0..7f0e75cb9b500 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -68,6 +68,7 @@ DLWRAP(hsa_amd_register_system_event_handler, 2)
DLWRAP(hsa_amd_signal_create, 5)
DLWRAP(hsa_amd_signal_async_handler, 5)
DLWRAP(hsa_amd_pointer_info, 5)
+DLWRAP(hsa_amd_pointer_info_set_userdata, 2)
DLWRAP(hsa_code_object_reader_create_from_memory, 3)
DLWRAP(hsa_code_object_reader_destroy, 1)
DLWRAP(hsa_executable_load_agent_code_object, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 29cfe78082dbb..5c2fbd127c86d 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -160,6 +160,7 @@ typedef struct hsa_amd_pointer_info_s {
void* agentBaseAddress;
void* hostBaseAddress;
size_t sizeInBytes;
+ void *userData;
} hsa_amd_pointer_info_t;
hsa_status_t hsa_amd_pointer_info(const void* ptr,
@@ -168,6 +169,8 @@ hsa_status_t hsa_amd_pointer_info(const void* ptr,
uint32_t* num_agents_accessible,
hsa_agent_t** accessible);
+hsa_status_t hsa_amd_pointer_info_set_userdata(const void *ptr, void *userdata);
+
#ifdef __cplusplus
}
#endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c26cfe961aa0e..90d9ca9f787e7 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -96,6 +96,8 @@ struct AMDGPUMemoryPoolTy;
namespace hsa_utils {
+using UserDataPair = PointerIntPair<AMDGPUDeviceTy *, 3>;
+
/// Iterate elements using an HSA iterate function. Do not use this function
/// directly but the specialized ones below instead.
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
@@ -2032,7 +2034,8 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy {
/// Class implementing the AMDGPU device functionalities which derives from the
/// generic device class.
-struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
+struct alignas(8) alignas(void *) AMDGPUDeviceTy : public GenericDeviceTy,
+ AMDGenericDeviceTy {
// Create an AMDGPU device with a device id and default AMDGPU grid values.
AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
@@ -3561,6 +3564,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
return KernelAgents;
}
+ Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override {
+ hsa_amd_pointer_info_t Info;
+ if (auto Err = Plugin::check(
+ hsa_amd_pointer_info(TgtPtr, &Info, nullptr, nullptr, nullptr),
+ "error in hsa_amd_pointer_info: %s"))
+ return Err;
+
+ // The pointer info struct contains an "agent" field, but that doesn't
+ // necessarily map to the device that created it
+ MemoryInfoTy ToReturn;
+ ToReturn.Base = Info.agentBaseAddress;
+ ToReturn.Size = Info.sizeInBytes;
+ auto UserData = hsa_utils::UserDataPair::getFromOpaqueValue(Info.userData);
+ ToReturn.Type = static_cast<TargetAllocTy>(UserData.getInt());
+ ToReturn.Device =
+ static_cast<GenericDeviceTy *>(UserData.getPointer());
+
+ return ToReturn;
+ }
+
private:
/// Event handler that will be called by ROCr if an event is detected.
static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
@@ -3882,6 +3905,12 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
REPORT("%s\n", toString(std::move(Err)).data());
return nullptr;
}
+
+ auto UserData = hsa_utils::UserDataPair(this, Kind);
+ if (auto Err = Plugin::check(
+ hsa_amd_pointer_info_set_userdata(Alloc, UserData.getOpaqueValue()),
+ "error in hsa_amd_pointer_info_set_userdata: %s"))
+ REPORT("%s\n", toString(std::move(Err)).data());
}
return Alloc;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f0c05a1b90716..ce270b3f1ab9a 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1271,6 +1271,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
+struct MemoryInfoTy {
+ void *Base;
+ size_t Size;
+ TargetAllocTy Type;
+ GenericDeviceTy *Device;
+
+ void *limit() const { return reinterpret_cast<char *>(Base) + Size; }
+};
+
/// Class implementing common functionalities of offload plugins. Each plugin
/// should define the specific plugin class, derive from this generic one, and
/// implement the necessary virtual function members.
@@ -1567,6 +1576,10 @@ struct GenericPluginTy {
/// object and return immediately.
int32_t async_barrier(omp_interop_val_t *Interop);
+ /// Given a pointer previously allocated by GenericDeviceTy::allocate, return
+ /// information about that allocation
+ virtual Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) = 0;
+
private:
/// Indicates if the platform runtime has been fully initialized.
bool Initialized = false;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index af3c74636bff3..f04049cd8bdfd 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -566,89 +566,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
}
/// Allocate memory on the device or related to the device.
- void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
- if (Size == 0)
- return nullptr;
-
- if (auto Err = setContext()) {
- REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
- return nullptr;
- }
-
- void *MemAlloc = nullptr;
- CUdeviceptr DevicePtr;
- CUresult Res;
-
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- Res = cuMemAlloc(&DevicePtr, Size);
- MemAlloc = (void *)DevicePtr;
- break;
- case TARGET_ALLOC_HOST:
- Res = cuMemAllocHost(&MemAlloc, Size);
- break;
- case TARGET_ALLOC_SHARED:
- Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
- MemAlloc = (void *)DevicePtr;
- break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
- break;
- cuStreamSynchronize(Stream);
- Res = cuStreamDestroy(Stream);
- MemAlloc = (void *)DevicePtr;
- }
- }
-
- if (auto Err =
- Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
- REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
- return nullptr;
- }
- return MemAlloc;
- }
+ void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
/// Deallocate memory on the device or related to the device.
- int free(void *TgtPtr, TargetAllocTy Kind) override {
- if (TgtPtr == nullptr)
- return OFFLOAD_SUCCESS;
-
- if (auto Err = setContext()) {
- REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
- return OFFLOAD_FAIL;
- }
-
- CUresult Res;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_SHARED:
- Res = cuMemFree((CUdeviceptr)TgtPtr);
- break;
- case TARGET_ALLOC_HOST:
- Res = cuMemFreeHost(TgtPtr);
- break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
- cuStreamSynchronize(Stream);
- if ((Res = cuStreamDestroy(Stream)))
- break;
- }
- }
-
- if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
- REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
- return OFFLOAD_FAIL;
- }
- return OFFLOAD_SUCCESS;
- }
+ int free(void *TgtPtr, TargetAllocTy Kind) override;
/// Synchronize current thread with the pending operations on the async info.
Error synchronizeImpl(__tgt_async_info &AsyncInfo,
@@ -1612,8 +1533,141 @@ struct CUDAPluginTy final : public GenericPluginTy {
// revision.
return Major == ImageMajor && Minor >= ImageMinor;
}
+
+ Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override {
+ std::lock_guard<std::mutex> Lock(MemoryInfoMutex);
+
+ // TODO: Is it possible to query this information using CUDA directly?
+
+ // Fast case, we have been given the base pointer directly
+ if (MemoryInfo.contains(TgtPtr))
+ return MemoryInfo[TgtPtr];
+
+ // Slower case, we need to look up the base pointer first
+ auto Loc = std::lower_bound(MemoryBases.begin(), MemoryBases.end(), TgtPtr,
+ [&](const void *Iter, const void *Val) {
+ return MemoryInfo[Iter].limit() <= Val;
+ });
+ if (Loc == MemoryBases.end() || TgtPtr < MemoryInfo[*Loc].Base)
+ return Plugin::error(ErrorCode::NOT_FOUND,
+ "allocated memory information not found");
+ return MemoryInfo[*Loc];
+ }
+
+private:
+ friend CUDADeviceTy;
+
+ std::mutex MemoryInfoMutex;
+ llvm::SmallVector<const void *> MemoryBases;
+ llvm::DenseMap<const void *, MemoryInfoTy> MemoryInfo;
};
+void *CUDADeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
+ if (Size == 0)
+ return nullptr;
+
+ if (auto Err = setContext()) {
+ REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
+ return nullptr;
+ }
+
+ void *MemAlloc = nullptr;
+ CUdeviceptr DevicePtr;
+ CUresult Res;
+
+ switch (Kind) {
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ Res = cuMemAlloc(&DevicePtr, Size);
+ MemAlloc = (void *)DevicePtr;
+ break;
+ case TARGET_ALLOC_HOST:
+ Res = cuMemAllocHost(&MemAlloc, Size);
+ break;
+ case TARGET_ALLOC_SHARED:
+ Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
+ MemAlloc = (void *)DevicePtr;
+ break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+ break;
+ cuStreamSynchronize(Stream);
+ Res = cuStreamDestroy(Stream);
+ MemAlloc = (void *)DevicePtr;
+ }
+ }
+
+ if (auto Err = Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
+ REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
+ return nullptr;
+ }
+
+ auto CudaPlugin = reinterpret_cast<CUDAPluginTy *>(&Plugin);
+ {
+ std::lock_guard<std::mutex> Lock(CudaPlugin->MemoryInfoMutex);
+ CudaPlugin->MemoryBases.insert(
+ std::lower_bound(CudaPlugin->MemoryBases.begin(),
+ CudaPlugin->MemoryBases.end(), MemAlloc),
+ MemAlloc);
+ CudaPlugin->MemoryInfo[MemAlloc] = MemoryInfoTy{
+ /*Base=*/MemAlloc,
+ /*Size=*/Size,
+ /*Type=*/Kind,
+ /*Device=*/this,
+ };
+ }
+ return MemAlloc;
+}
+
+int CUDADeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+ if (TgtPtr == nullptr)
+ return OFFLOAD_SUCCESS;
+
+ if (auto Err = setContext()) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ CUresult Res;
+ switch (Kind) {
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_SHARED:
+ Res = cuMemFree((CUdeviceptr)TgtPtr);
+ break;
+ case TARGET_ALLOC_HOST:
+ Res = cuMemFreeHost(TgtPtr);
+ break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ cuStreamSynchronize(Stream);
+ if ((Res = cuStreamDestroy(Stream)))
+ break;
+ }
+ }
+
+ if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
+ REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+ return OFFLOAD_FAIL;
+ }
+
+ auto CudaPlugin = reinterpret_cast<CUDAPluginTy *>(&Plugin);
+ {
+ std::lock_guard<std::mutex> Lock(CudaPlugin->MemoryInfoMutex);
+ CudaPlugin->MemoryBases.erase(std::find(CudaPlugin->MemoryBases.begin(),
+ CudaPlugin->MemoryBases.end(),
+ TgtPtr));
+ [[maybe_unused]] bool Erased = CudaPlugin->MemoryInfo.erase(TgtPtr); assert(Erased);
+ }
+ return OFFLOAD_SUCCESS;
+}
+
Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
GenericDeviceTy &DstGenericDevice,
void *DstPtr, int64_t Size,
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index f440ebaf17fe4..0f57043da448c 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -517,6 +517,11 @@ struct GenELF64PluginTy final : public GenericPluginTy {
#endif
}
+ Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "get_memory_info not implemented for host device");
+ }
+
const char *getName() const override { return GETNAME(TARGET_NAME); }
};
>From dfc89b4699156c0f82841cf0681405377ebc5763 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 9 Sep 2025 11:11:58 +0100
Subject: [PATCH 2/2] Don't use auto for MemInfo
---
offload/liboffload/src/OffloadImpl.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 9620c35ac5c10..f399a74989036 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -626,7 +626,7 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
}
Error olMemFree_impl(ol_platform_handle_t Platform, void *Address) {
- auto MemInfo = Platform->Plugin->get_memory_info(Address);
+ Expected<MemoryInfoTy> MemInfo = Platform->Plugin->get_memory_info(Address);
if (auto Err = MemInfo.takeError())
return Err;
More information about the llvm-branch-commits
mailing list