[Openmp-commits] [openmp] f5f329a - [AMDGPU][Libomptarget] Rework logic for locating kernarg pools

Sun Jun 6 23:41:49 PDT 2021

Author: Pushpinder Singh
Date: 2021-06-07T06:41:37Z
New Revision: f5f329a37175c3a30e9d8b20b95fced96b9dde8c

URL: https://github.com/llvm/llvm-project/commit/f5f329a37175c3a30e9d8b20b95fced96b9dde8c
DIFF: https://github.com/llvm/llvm-project/commit/f5f329a37175c3a30e9d8b20b95fced96b9dde8c.diff

LOG: [AMDGPU][Libomptarget] Rework logic for locating kernarg pools

Previous logic was to always use the first kernarg pool found to allocate
kernel args. This patch changes this to use only the kernarg pool which
has non-zero size. This logic is also reworked to not use any globals.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D103600

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/impl/internal.h
    openmp/libomptarget/plugins/amdgpu/impl/system.cpp
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index 7a8f0fd30b9cf..0cdd5ee9748da 100644

--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -181,8 +181,6 @@ struct SignalPoolT {
   };
 };
 
-extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 namespace core {
 hsa_status_t atl_init_gpu_context();
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 85e79dc4058f2..9377e3a96c433 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -142,8 +142,6 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
 
 ATLMachine g_atl_machine;
 
-std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
-
 /*
    atlc is all internal global values.
    The structure atl_context_t is defined in atl_internal.h
@@ -198,10 +196,6 @@ static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
       proc->addMemory(new_mem);
-      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
-        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
-        atl_gpu_kernarg_pools.push_back(memory_pool);
-      }
     } else {
       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
       proc->addMemory(new_mem);

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 344c1be00f1cc..89345b6845593 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,6 +37,7 @@
 
 #include "Debug.h"
 #include "get_elf_mach_gfx_name.h"
+#include "machine.h"
 #include "omptargetplugin.h"
 #include "print_tracing.h"
 
@@ -136,15 +137,15 @@ struct KernelArgPool {
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t kernarg_segment_size)
+  KernelArgPool(uint32_t kernarg_segment_size,
+                hsa_amd_memory_pool_t &memory_pool)
       : kernarg_segment_size(kernarg_segment_size) {
 
     // atmi uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
 
     hsa_status_t err = hsa_amd_memory_pool_allocate(
-        atl_gpu_kernarg_pools[0],
-        kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
+        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
         &kernarg_region);
 
     if (err != HSA_STATUS_SUCCESS) {
@@ -224,7 +225,8 @@ struct KernelTy {
 
   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
            void *_CallStackAddr, const char *_Name,
-           uint32_t _kernarg_segment_size)
+           uint32_t _kernarg_segment_size,
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -232,8 +234,8 @@ struct KernelTy {
     std::string N(_Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
       KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(
-                                new KernelArgPool(_kernarg_segment_size))));
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                _kernarg_segment_size, KernArgMemoryPool))));
     }
   }
 };
@@ -297,6 +299,74 @@ uint16_t create_header() {
   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
   return header;
 }
+
+hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
+  std::vector<hsa_amd_memory_pool_t> *Result =
+      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
+  bool AllocAllowed = false;
+  hsa_status_t err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+      &AllocAllowed);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+            get_error_string(err));
+    return err;
+  }
+
+  if (!AllocAllowed) {
+    // nothing needs to be done here.
+    return HSA_STATUS_SUCCESS;
+  }
+
+  uint32_t GlobalFlags = 0;
+  err = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
+    return err;
+  }
+
+  fprintf(stderr, "Flags : %d\n", GlobalFlags);
+  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
+    size_t size = 0;
+    err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
+    if (err != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "Get memory pool size failed: %s\n",
+              get_error_string(err));
+      return err;
+    }
+    if (size > 0)
+      Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+std::pair<hsa_status_t, hsa_amd_memory_pool_t>
+FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
+  std::vector<hsa_amd_memory_pool_t> KernArgPools;
+  for (const auto &processor : g_atl_machine.processors<ATLCPUProcessor>()) {
+    hsa_agent_t Agent = processor.agent();
+    hsa_status_t err = HSA_STATUS_SUCCESS;
+    err = hsa_amd_agent_iterate_memory_pools(
+        Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(err));
+      return {err, hsa_amd_memory_pool_t{}};
+    }
+  }
+
+  if (KernArgPools.empty()) {
+    fprintf(stderr, "Unable to find any valid kernarg pool\n");
+    return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
+  }
+
+  return {HSA_STATUS_SUCCESS, KernArgPools[0]};
+}
+
 } // namespace
 } // namespace core
 
@@ -344,6 +414,8 @@ class RTLDeviceInfoTy {
   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
 
+  hsa_amd_memory_pool_t KernArgPool;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       core::Runtime::Memfree(p); // ignore failure to free
@@ -477,6 +549,12 @@ class RTLDeviceInfoTy {
       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
     }
 
+    std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents);
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when reading memory pools\n");
+      return;
+    }
+
     // Init the device info
     HSAQueues.resize(NumberOfDevices);
     FuncGblEntries.resize(NumberOfDevices);
@@ -1543,8 +1621,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
     }
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
-                                   CallStackAddr, e->name,
-                                   kernarg_segment_size));
+                                   CallStackAddr, e->name, kernarg_segment_size,
+                                   DeviceInfo.KernArgPool));
     __tgt_offload_entry entry = *e;
     entry.addr = (void *)&KernelsList.back();
     DeviceInfo.addOffloadEntry(device_id, entry);