[Openmp-commits] [openmp] 20df2c7 - [AMDGPU][Libomptarget] Collect allocatable memory pools using HSA
Pushpinder Singh via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jun 28 04:28:15 PDT 2021
Author: Pushpinder Singh
Date: 2021-06-28T11:28:04Z
New Revision: 20df2c7052c09934ce87ccc409da9d3dc24b7ca0
URL: https://github.com/llvm/llvm-project/commit/20df2c7052c09934ce87ccc409da9d3dc24b7ca0
DIFF: https://github.com/llvm/llvm-project/commit/20df2c7052c09934ce87ccc409da9d3dc24b7ca0.diff
LOG: [AMDGPU][Libomptarget] Collect allocatable memory pools using HSA
The logic is nearly the same as that of system.cpp, with one change:
instead of adding all the memory pools to a device struct, it keeps
only a single pool. The existing approach also always allocated memory on
the first HSA pool found for a GPU.
This depends on D104691. The goal of this series of patches is to remove
the g_atl_machine global. The next patch will drop g_atl_machine entirely.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D104695
Added:
Modified:
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index a8252f2cb4b00..9a07d26546bbc 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -17,6 +17,7 @@
#include <cstring>
#include <elf.h>
#include <fstream>
+#include <functional>
#include <iostream>
#include <libelf.h>
#include <list>
@@ -104,6 +105,16 @@ template <typename C> hsa_status_t iterate_agents(C cb) {
return hsa_iterate_agents(L, static_cast<void *>(&cb));
}
+/// Forward to hsa_amd_agent_iterate_memory_pools for \p Agent, routing every
+/// callback invocation to the callable \p cb.
+///
+/// \p cb is invoked with an hsa_amd_memory_pool_t and its result is returned to
+/// HSA as an hsa_status_t. The status of the HSA iteration call is returned.
+template <typename C>
+hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
+  // The HSA entry point takes a callback plus an opaque pointer, so the address
+  // of cb travels as the user data and is cast back inside a captureless
+  // lambda.
+  auto Trampoline = [](hsa_amd_memory_pool_t Pool,
+                       void *UserData) -> hsa_status_t {
+    C &Callback = *static_cast<C *>(UserData);
+    return Callback(Pool);
+  };
+
+  void *UserData = static_cast<void *>(&cb);
+  return hsa_amd_agent_iterate_memory_pools(Agent, Trampoline, UserData);
+}
+
} // namespace hsa
/// Keep entries table per device
@@ -329,18 +340,60 @@ hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
return err;
}
+ size_t size = 0;
+ err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
+ &size);
+ if (err != HSA_STATUS_SUCCESS) {
+ fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
+ return err;
+ }
+
if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
- (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
- size_t size = 0;
- err = hsa_amd_memory_pool_get_info(MemoryPool,
- HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
- if (err != HSA_STATUS_SUCCESS) {
- fprintf(stderr, "Get memory pool size failed: %s\n",
- get_error_string(err));
- return err;
+ (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
+ size > 0) {
+ Result->push_back(MemoryPool);
+ }
+
+ return HSA_STATUS_SUCCESS;
+}
+
+/// Query HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED for \p MemoryPool.
+///
+/// \returns {HSA_STATUS_SUCCESS, queried flag} when the query succeeds;
+/// otherwise logs the failure to stderr and returns {error status, false}.
+std::pair<hsa_status_t, bool>
+isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
+  bool CanAllocate = false;
+  const hsa_status_t Status = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &CanAllocate);
+
+  if (Status == HSA_STATUS_SUCCESS)
+    return {HSA_STATUS_SUCCESS, CanAllocate};
+
+  fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+          get_error_string(Status));
+  return {Status, false};
+}
+
+template <typename AccumulatorFunc>
+hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
+ AccumulatorFunc Func) {
+ for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
+ hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
+ Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
+ hsa_status_t Err;
+ bool Valid = false;
+ std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
+ if (Err != HSA_STATUS_SUCCESS) {
+ return Err;
+ }
+ if (Valid)
+ Func(MemoryPool, DeviceId);
+ return HSA_STATUS_SUCCESS;
+ });
+
+ if (Err != HSA_STATUS_SUCCESS) {
+ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+ "Iterate all memory pools", get_error_string(Err));
+ return Err;
}
- if (size > 0)
- Result->push_back(MemoryPool);
}
return HSA_STATUS_SUCCESS;
@@ -421,6 +474,13 @@ class RTLDeviceInfoTy {
hsa_amd_memory_pool_t KernArgPool;
+ // fine grained memory pool for host allocations
+ hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
+
+ // fine and coarse-grained memory pools per offloading device
+ std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
+ std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
+
struct atmiFreePtrDeletor {
void operator()(void *p) {
core::Runtime::Memfree(p); // ignore failure to free
@@ -523,6 +583,82 @@ class RTLDeviceInfoTy {
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
}
+ /// Store \p MemoryPool as the pool of device \p DeviceId, choosing the
+ /// fine-grained or coarse-grained table from the pool's global flags.
+ ///
+ /// A pool with neither flag set is ignored. A later pool of the same kind
+ /// overwrites the one stored earlier for that device.
+ ///
+ /// \returns the status of the global-flags query if it fails, otherwise
+ /// HSA_STATUS_SUCCESS.
+ hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                  int DeviceId) {
+   assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
+
+   uint32_t PoolFlags = 0;
+   const hsa_status_t Status = hsa_amd_memory_pool_get_info(
+       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &PoolFlags);
+   if (Status != HSA_STATUS_SUCCESS)
+     return Status;
+
+   const bool IsFineGrained =
+       (PoolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0;
+   const bool IsCoarseGrained =
+       (PoolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) != 0;
+
+   // Fine-grained takes precedence when both flags are reported.
+   if (IsFineGrained)
+     DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
+   else if (IsCoarseGrained)
+     DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
+
+   return HSA_STATUS_SUCCESS;
+ }
+
+ /// Store \p MemoryPool in HostFineGrainedMemoryPool when it is fine-grained
+ /// and reports a non-zero size; any other pool is ignored.
+ ///
+ /// \p DeviceId is not used here; it is accepted so this method has the same
+ /// signature as addDeviceMemoryPool.
+ ///
+ /// \returns the status of the first failing pool-info query, otherwise
+ /// HSA_STATUS_SUCCESS.
+ hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                int DeviceId) {
+   uint32_t GlobalFlags = 0;
+   hsa_status_t Err = hsa_amd_memory_pool_get_info(
+       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+
+   if (Err != HSA_STATUS_SUCCESS) {
+     return Err;
+   }
+
+   // The pool size is read into a size_t, as addKernArgPool does for the same
+   // attribute. A uint32_t is too small on 64-bit targets: the runtime would
+   // write past the end of the variable.
+   size_t Size = 0;
+   Err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                      HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
+   if (Err != HSA_STATUS_SUCCESS) {
+     return Err;
+   }
+
+   if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+       Size > 0) {
+     HostFineGrainedMemoryPool = MemoryPool;
+   }
+
+   return HSA_STATUS_SUCCESS;
+ }
+
+ /// Run core::collectMemoryPools over CPUAgents with addHostMemoryPool as the
+ /// accumulator, then over HSAAgents with addDeviceMemoryPool.
+ ///
+ /// \returns the first failing status from core::collectMemoryPools (after
+ /// logging it to stderr), otherwise HSA_STATUS_SUCCESS.
+ hsa_status_t setupMemoryPools() {
+   auto RecordHostPool = [this](hsa_amd_memory_pool_t Pool, int DeviceId) {
+     return addHostMemoryPool(Pool, DeviceId);
+   };
+   hsa_status_t Status = core::collectMemoryPools(CPUAgents, RecordHostPool);
+   if (Status != HSA_STATUS_SUCCESS) {
+     fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
+             get_error_string(Status));
+     return Status;
+   }
+
+   auto RecordDevicePool = [this](hsa_amd_memory_pool_t Pool, int DeviceId) {
+     return addDeviceMemoryPool(Pool, DeviceId);
+   };
+   Status = core::collectMemoryPools(HSAAgents, RecordDevicePool);
+   if (Status != HSA_STATUS_SUCCESS) {
+     fprintf(stderr,
+             "HSA error in collecting memory pools for offload devices: %s\n",
+             get_error_string(Status));
+     return Status;
+   }
+
+   return HSA_STATUS_SUCCESS;
+ }
+
+ /// Return the entry of DeviceCoarseGrainedMemoryPools for \p DeviceId.
+ /// Asserts that \p DeviceId is a valid index into that vector.
+ hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
+   assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
+          "Invalid device Id");
+   const hsa_amd_memory_pool_t &DevicePool =
+       DeviceCoarseGrainedMemoryPools[DeviceId];
+   return DevicePool;
+ }
+
+ // Return the pool currently stored in HostFineGrainedMemoryPool (the member
+ // assigned by addHostMemoryPool). No validity check is performed here.
+ hsa_amd_memory_pool_t getHostMemoryPool() {
+ return HostFineGrainedMemoryPool;
+ }
+
RTLDeviceInfoTy() {
// LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
// anytime. You do not need a debug library build.
@@ -581,6 +717,14 @@ class RTLDeviceInfoTy {
deviceStateStore.resize(NumberOfDevices);
KernelInfoTable.resize(NumberOfDevices);
SymbolInfoTable.resize(NumberOfDevices);
+ DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
+ DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+
+ err = setupMemoryPools();
+ if (err != HSA_STATUS_SUCCESS) {
+ DP("Error when setting up memory pools");
+ return;
+ }
for (int i = 0; i < NumberOfDevices; i++) {
HSAQueues[i] = nullptr;
More information about the Openmp-commits
mailing list