[Openmp-commits] [openmp] 5d989fb - [libomptarget][amdgpu] Improve thread safety, remove dead code
via Openmp-commits
openmp-commits at lists.llvm.org
Wed Aug 26 14:04:58 PDT 2020
Author: JonChesterfield
Date: 2020-08-26T22:04:03+01:00
New Revision: 5d989fb37d7cfb4f7766a45d4efc82b5add3811f
URL: https://github.com/llvm/llvm-project/commit/5d989fb37d7cfb4f7766a45d4efc82b5add3811f
DIFF: https://github.com/llvm/llvm-project/commit/5d989fb37d7cfb4f7766a45d4efc82b5add3811f.diff
LOG: [libomptarget][amdgpu] Improve thread safety, remove dead code
Added:
Modified:
openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
openmp/libomptarget/plugins/amdgpu/impl/atmi.h
openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
openmp/libomptarget/plugins/amdgpu/impl/data.cpp
openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
openmp/libomptarget/plugins/amdgpu/impl/machine.h
openmp/libomptarget/plugins/amdgpu/impl/rt.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Removed:
openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
index 3856403504a9..6a6bc8c35c87 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
@@ -33,8 +33,9 @@ atmi_status_t atmi_module_register_from_memory_to_place(
/*
* Data
*/
-atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) {
- return core::Runtime::Memcpy(dest, src, size);
+atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
+ size_t size) {
+ return core::Runtime::Memcpy(sig, dest, src, size);
}
atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h
index bfe95f93dbaf..35e61216ce78 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h
@@ -141,15 +141,6 @@ typedef struct atmi_device_s {
* Device type: CPU, GPU or DSP
*/
atmi_devtype_t type;
- /**
- * The number of compute cores
- */
- unsigned int core_count;
- /**
- * The number of memory spaces/regions that are accessible
- * from this device
- */
- unsigned int memory_count;
/**
* Array of memory spaces/regions that are accessible
* from this device.
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
index ac52b89cb4f0..eb4a46c35a9b 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
@@ -18,7 +18,7 @@ atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
size_t var_size;
atmi_interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr,
&var_size);
- atmi_memcpy(host_add, var_addr, var_size);
+ atmi_memcpy(signal, host_add, var_addr, var_size);
*/
if (!atl_is_atmi_initialized())
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
deleted file mode 100644
index 4d8323e55651..000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*===--------------------------------------------------------------------------
- * ATMI (Asynchronous Task and Memory Interface)
- *
- * This file is distributed under the MIT License. See LICENSE.txt for details.
- *===------------------------------------------------------------------------*/
-#ifndef INCLUDE_ATMI_KL_H_
-#define INCLUDE_ATMI_KL_H_
-
-#include "atmi.h"
-#ifdef __OPENCL_C_VERSION__
-#include "ockl_hsa.h"
-#endif
-#define MAX_NUM_KERNELS (1024 * 16)
-
-typedef struct atmi_implicit_args_s {
- unsigned long offset_x;
- unsigned long offset_y;
- unsigned long offset_z;
- unsigned long hostcall_ptr;
- char num_gpu_queues;
- unsigned long gpu_queue_ptr;
- char num_cpu_queues;
- unsigned long cpu_worker_signals;
- unsigned long cpu_queue_ptr;
- unsigned long kernarg_template_ptr;
- // possible TODO: send signal pool to be used by DAGs on GPU
- // uint8_t num_signals;
- // unsigned long signal_ptr;
-} atmi_implicit_args_t;
-
-typedef struct atmi_kernel_enqueue_template_s {
- unsigned long kernel_handle;
- hsa_kernel_dispatch_packet_t k_packet;
- hsa_agent_dispatch_packet_t a_packet;
- unsigned long kernarg_segment_size;
- void *kernarg_regions;
-} atmi_kernel_enqueue_template_t;
-
-#endif // INCLUDE_ATMI_KL_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
index 04fddd0b2d61..8ee78dfca3e4 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
@@ -7,6 +7,7 @@
#define INCLUDE_ATMI_RUNTIME_H_
#include "atmi.h"
+#include "hsa.h"
#include <inttypes.h>
#include <stdlib.h>
#ifndef __cplusplus
@@ -178,13 +179,31 @@ atmi_status_t atmi_free(void *ptr);
* @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
*
*/
-atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size);
+atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
+ size_t size);
-/** @} */
+static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src,
+ size_t size) {
+ hsa_signal_t sig;
+ hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig);
+ if (err != HSA_STATUS_SUCCESS) {
+ return ATMI_STATUS_ERROR;
+ }
-/** \defgroup cpu_dev_runtime ATMI CPU Device Runtime
- * @{
- */
+ atmi_status_t r = atmi_memcpy(sig, dest, src, size);
+ hsa_status_t rc = hsa_signal_destroy(sig);
+
+ if (r != ATMI_STATUS_SUCCESS) {
+ return r;
+ }
+ if (rc != HSA_STATUS_SUCCESS) {
+ return ATMI_STATUS_ERROR;
+ }
+
+ return ATMI_STATUS_SUCCESS;
+}
+
+/** @} */
#ifdef __cplusplus
}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
index 1d20fc911c51..91627bd8d4b3 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
@@ -19,7 +19,6 @@
using core::TaskImpl;
extern ATLMachine g_atl_machine;
-extern hsa_signal_t IdentityCopySignal;
namespace core {
ATLPointerTracker g_data_map; // Track all am pointer allocations.
@@ -130,41 +129,59 @@ atmi_status_t Runtime::Memfree(void *ptr) {
return ret;
}
-static hsa_status_t invoke_hsa_copy(void *dest, const void *src, size_t size,
+static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
+ const void *src, size_t size,
hsa_agent_t agent) {
- // TODO: Use thread safe signal
- hsa_signal_store_screlease(IdentityCopySignal, 1);
+ const hsa_signal_value_t init = 1;
+ const hsa_signal_value_t success = 0;
+ hsa_signal_store_screlease(sig, init);
+
+ hsa_status_t err =
+ hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
+ if (err != HSA_STATUS_SUCCESS) {
+ return err;
+ }
- hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
- NULL, IdentityCopySignal);
- ErrorCheck(Copy async between memory pools, err);
+ // async_copy reports success by decrementing and failure by setting to < 0
+ hsa_signal_value_t got = init;
+ while (got == init) {
+ got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
+ UINT64_MAX, ATMI_WAIT_STATE);
+ }
- // TODO: async reports errors in the signal, use NE 1
- hsa_signal_wait_scacquire(IdentityCopySignal, HSA_SIGNAL_CONDITION_EQ, 0,
- UINT64_MAX, ATMI_WAIT_STATE);
+ if (got != success) {
+ return HSA_STATUS_ERROR;
+ }
return err;
}
-atmi_status_t Runtime::Memcpy(void *dest, const void *src, size_t size) {
- atmi_status_t ret;
- hsa_status_t err;
+struct atmiFreePtrDeletor {
+ void operator()(void *p) {
+ atmi_free(p); // ignore failure to free
+ }
+};
+
+atmi_status_t Runtime::Memcpy(hsa_signal_t sig, void *dest, const void *src,
+ size_t size) {
ATLData *src_data = g_data_map.find(src);
ATLData *dest_data = g_data_map.find(dest);
atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
+
void *temp_host_ptr;
+ atmi_status_t ret = atmi_malloc(&temp_host_ptr, size, cpu);
+ if (ret != ATMI_STATUS_SUCCESS) {
+ return ret;
+ }
+ std::unique_ptr<void, atmiFreePtrDeletor> del(temp_host_ptr);
if (src_data && !dest_data) {
// Copy from device to scratch to host
hsa_agent_t agent = get_mem_agent(src_data->place());
DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle);
- ret = atmi_malloc(&temp_host_ptr, size, cpu);
- if (ret != ATMI_STATUS_SUCCESS) {
- return ret;
- }
- err = invoke_hsa_copy(temp_host_ptr, src, size, agent);
- if (err != HSA_STATUS_SUCCESS) {
+ if (invoke_hsa_copy(sig, temp_host_ptr, src, size, agent) !=
+ HSA_STATUS_SUCCESS) {
return ATMI_STATUS_ERROR;
}
@@ -174,30 +191,24 @@ atmi_status_t Runtime::Memcpy(void *dest, const void *src, size_t size) {
// Copy from host to scratch to device
hsa_agent_t agent = get_mem_agent(dest_data->place());
DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle);
- ret = atmi_malloc(&temp_host_ptr, size, cpu);
- if (ret != ATMI_STATUS_SUCCESS) {
- return ret;
- }
memcpy(temp_host_ptr, src, size);
- DEBUG_PRINT("Memcpy device agent: %lu\n", agent.handle);
- err = invoke_hsa_copy(dest, temp_host_ptr, size, agent);
+ if (invoke_hsa_copy(sig, dest, temp_host_ptr, size, agent) !=
+ HSA_STATUS_SUCCESS) {
+ return ATMI_STATUS_ERROR;
+ }
} else if (!src_data && !dest_data) {
- DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
// would be host to host, just call memcpy, or missing metadata
+ DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
return ATMI_STATUS_ERROR;
} else {
DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n");
return ATMI_STATUS_ERROR;
}
- ret = atmi_free(temp_host_ptr);
-
- if (err != HSA_STATUS_SUCCESS || ret != ATMI_STATUS_SUCCESS)
- ret = ATMI_STATUS_ERROR;
- return ret;
+ return ATMI_STATUS_SUCCESS;
}
} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
index 64548dd4a0f8..ff8ac1c3550b 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
@@ -16,18 +16,6 @@
extern ATLMachine g_atl_machine;
extern hsa_region_t atl_cpu_kernarg_region;
-void *ATLMemory::alloc(size_t sz) {
- void *ret;
- hsa_status_t err = hsa_amd_memory_pool_allocate(memory_pool_, sz, 0, &ret);
- ErrorCheck(Allocate from memory pool, err);
- return ret;
-}
-
-void ATLMemory::free(void *ptr) {
- hsa_status_t err = hsa_amd_memory_pool_free(ptr);
- ErrorCheck(Allocate from memory pool, err);
-}
-
void ATLProcessor::addMemory(const ATLMemory &mem) {
for (auto &mem_obj : memories_) {
// if the memory already exists, then just return
@@ -66,63 +54,3 @@ template <> void ATLMachine::addProcessor(const ATLCPUProcessor &p) {
template <> void ATLMachine::addProcessor(const ATLGPUProcessor &p) {
gpu_processors_.push_back(p);
}
-
-void callbackQueue(hsa_status_t status, hsa_queue_t *source, void *data) {
- if (status != HSA_STATUS_SUCCESS) {
- fprintf(stderr, "[%s:%d] GPU error in queue %p %d\n", __FILE__, __LINE__,
- source, status);
- abort();
- }
-}
-
-void ATLGPUProcessor::createQueues(const int count) {
- int *num_cus = reinterpret_cast<int *>(calloc(count, sizeof(int)));
-
- hsa_status_t err;
- /* Query the maximum size of the queue. */
- uint32_t queue_size = 0;
- err = hsa_agent_get_info(agent_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
- ErrorCheck(Querying the agent maximum queue size, err);
- if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
- queue_size = core::Runtime::getInstance().getMaxQueueSize();
- }
-
- /* Create queues for each device. */
- int qid;
- for (qid = 0; qid < count; qid++) {
- hsa_queue_t *this_Q;
- err =
- hsa_queue_create(agent_, queue_size, HSA_QUEUE_TYPE_MULTI,
- callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &this_Q);
- ErrorCheck(Creating the queue, err);
- err = hsa_amd_profiling_set_profiler_enabled(this_Q, 1);
- ErrorCheck(Enabling profiling support, err);
-
- queues_.push_back(this_Q);
-
- DEBUG_PRINT("Queue[%d]: %p\n", qid, this_Q);
- }
-
- free(num_cus);
-}
-
-void ATLCPUProcessor::createQueues(const int) {}
-
-void ATLProcessor::destroyQueues() {
- for (auto queue : queues_) {
- hsa_status_t err = hsa_queue_destroy(queue);
- ErrorCheck(Destroying the queue, err);
- }
-}
-
-int ATLProcessor::num_cus() const {
- hsa_status_t err;
- /* Query the number of compute units. */
- uint32_t num_cus = 0;
- err = hsa_agent_get_info(
- agent_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
- &num_cus);
- ErrorCheck(Querying the agent number of compute units, err);
-
- return num_cus;
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
index 9ccf67f7e4c5..93169ed4eafb 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
@@ -18,7 +18,6 @@ class ATLProcessor {
explicit ATLProcessor(hsa_agent_t agent,
atmi_devtype_t type = ATMI_DEVTYPE_ALL)
: agent_(agent), type_(type) {
- queues_.clear();
memories_.clear();
}
void addMemory(const ATLMemory &p);
@@ -29,16 +28,9 @@ class ATLProcessor {
const std::vector<ATLMemory> &memories() const;
atmi_devtype_t type() const { return type_; }
- virtual void createQueues(const int count) {}
- virtual void destroyQueues();
- std::vector<hsa_queue_t *> queues() const { return queues_; }
-
- int num_cus() const;
-
protected:
hsa_agent_t agent_;
atmi_devtype_t type_;
- std::vector<hsa_queue_t *> queues_;
std::vector<ATLMemory> memories_;
};
@@ -46,7 +38,6 @@ class ATLCPUProcessor : public ATLProcessor {
public:
explicit ATLCPUProcessor(hsa_agent_t agent)
: ATLProcessor(agent, ATMI_DEVTYPE_CPU) {}
- void createQueues(const int count);
};
class ATLGPUProcessor : public ATLProcessor {
@@ -54,21 +45,16 @@ class ATLGPUProcessor : public ATLProcessor {
explicit ATLGPUProcessor(hsa_agent_t agent,
atmi_devtype_t type = ATMI_DEVTYPE_dGPU)
: ATLProcessor(agent, type) {}
- void createQueues(const int count);
};
class ATLMemory {
public:
ATLMemory(hsa_amd_memory_pool_t pool, ATLProcessor p, atmi_memtype_t t)
: memory_pool_(pool), processor_(p), type_(t) {}
- ATLProcessor &processor() { return processor_; }
hsa_amd_memory_pool_t memory() const { return memory_pool_; }
atmi_memtype_t type() const { return type_; }
- void *alloc(size_t s);
- void free(void *p);
-
private:
hsa_amd_memory_pool_t memory_pool_;
ATLProcessor processor_;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
index 8863c383e500..757919eb3a45 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
@@ -14,29 +14,19 @@
namespace core {
#define DEFAULT_MAX_QUEUE_SIZE 4096
-#define DEFAULT_MAX_KERNEL_TYPES 32
-#define DEFAULT_NUM_GPU_QUEUES -1 // computed in code
-#define DEFAULT_NUM_CPU_QUEUES -1 // computed in code
#define DEFAULT_DEBUG_MODE 0
class Environment {
public:
Environment()
: max_queue_size_(DEFAULT_MAX_QUEUE_SIZE),
- max_kernel_types_(DEFAULT_MAX_KERNEL_TYPES),
- num_gpu_queues_(DEFAULT_NUM_GPU_QUEUES),
- num_cpu_queues_(DEFAULT_NUM_CPU_QUEUES),
debug_mode_(DEFAULT_DEBUG_MODE) {
GetEnvAll();
}
- ~Environment() {}
-
void GetEnvAll();
int getMaxQueueSize() const { return max_queue_size_; }
- int getMaxKernelTypes() const { return max_kernel_types_; }
- int getNumGPUQueues() const { return num_gpu_queues_; }
- int getNumCPUQueues() const { return num_cpu_queues_; }
+
// TODO(ashwinma): int may change to enum if we have more debug modes
int getDebugMode() const { return debug_mode_; }
// TODO(ashwinma): int may change to enum if we have more profile modes
@@ -52,9 +42,6 @@ class Environment {
}
int max_queue_size_;
- int max_kernel_types_;
- int num_gpu_queues_;
- int num_cpu_queues_;
int debug_mode_;
};
@@ -68,7 +55,8 @@ class Runtime final {
// init/finalize
static atmi_status_t Initialize();
static atmi_status_t Finalize();
-
+ // machine info
+ static atmi_machine_t *GetMachineInfo();
// modules
static atmi_status_t RegisterModuleFromMemory(
void *, size_t, atmi_place_t,
@@ -76,19 +64,14 @@ class Runtime final {
void *cb_state),
void *cb_state);
- // machine info
- static atmi_machine_t *GetMachineInfo();
-
// data
- static atmi_status_t Memcpy(void *, const void *, size_t);
+ static atmi_status_t Memcpy(hsa_signal_t, void *, const void *, size_t);
static atmi_status_t Memfree(void *);
static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t);
// environment variables
int getMaxQueueSize() const { return env_.getMaxQueueSize(); }
- int getMaxKernelTypes() const { return env_.getMaxKernelTypes(); }
- int getNumGPUQueues() const { return env_.getNumGPUQueues(); }
- int getNumCPUQueues() const { return env_.getNumCPUQueues(); }
+
// TODO(ashwinma): int may change to enum if we have more debug modes
int getDebugMode() const { return env_.getDebugMode(); }
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index 2c31aafc7624..913dc91b298d 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -164,8 +164,6 @@ int context_init_time_init = 0;
atl_context_t atlc = {.struct_initialized = false};
atl_context_t *atlc_p = NULL;
-hsa_signal_t IdentityCopySignal;
-
namespace core {
/* Machine Info */
atmi_machine_t *Runtime::GetMachineInfo() {
@@ -221,14 +219,6 @@ atmi_status_t Runtime::Finalize() {
ErrorCheck(Destroying executable, err);
}
- // Finalize queues
- for (auto &p : g_atl_machine.processors<ATLCPUProcessor>()) {
- p.destroyQueues();
- }
- for (auto &p : g_atl_machine.processors<ATLGPUProcessor>()) {
- p.destroyQueues();
- }
-
for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
SymbolInfoTable[i].clear();
}
@@ -450,7 +440,6 @@ static hsa_status_t init_compute_and_memory() {
int proc_index = 0;
for (int i = cpus_begin; i < cpus_end; i++) {
all_devices[i].type = cpu_procs[proc_index].type();
- all_devices[i].core_count = cpu_procs[proc_index].num_cus();
std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
int fine_memories_size = 0;
@@ -468,13 +457,11 @@ static hsa_status_t init_compute_and_memory() {
}
DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
- all_devices[i].memory_count = memories.size();
proc_index++;
}
proc_index = 0;
for (int i = gpus_begin; i < gpus_end; i++) {
all_devices[i].type = gpu_procs[proc_index].type();
- all_devices[i].core_count = gpu_procs[proc_index].num_cus();
std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
int fine_memories_size = 0;
@@ -492,7 +479,6 @@ static hsa_status_t init_compute_and_memory() {
}
DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
- all_devices[i].memory_count = memories.size();
proc_index++;
}
proc_index = 0;
@@ -551,8 +537,6 @@ hsa_status_t init_hsa() {
void init_tasks() {
if (atlc.g_tasks_initialized != false)
return;
- hsa_status_t err;
- int task_num;
std::vector<hsa_agent_t> gpu_agents;
int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
for (int gpu = 0; gpu < gpu_count; gpu++) {
@@ -560,8 +544,6 @@ void init_tasks() {
ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
gpu_agents.push_back(proc.agent());
}
- err = hsa_signal_create(0, 0, NULL, &IdentityCopySignal);
- ErrorCheck(Creating a HSA signal, err);
atlc.g_tasks_initialized = true;
}
@@ -616,18 +598,6 @@ atmi_status_t atl_init_gpu_context() {
if (err != HSA_STATUS_SUCCESS)
return ATMI_STATUS_ERROR;
- int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
- for (int gpu = 0; gpu < gpu_count; gpu++) {
- atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
- ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
- int num_gpu_queues = core::Runtime::getInstance().getNumGPUQueues();
- if (num_gpu_queues == -1) {
- num_gpu_queues = proc.num_cus();
- num_gpu_queues = (num_gpu_queues > 8) ? 8 : num_gpu_queues;
- }
- proc.createQueues(num_gpu_queues);
- }
-
if (context_init_time_init == 0) {
clock_gettime(CLOCK_MONOTONIC_RAW, &context_init_time);
context_init_time_init = 1;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
index 8ce6c7bd585c..2aa09ff47e40 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
@@ -97,9 +97,6 @@ void Environment::GetEnvAll() {
std::string var = GetEnv("ATMI_HELP");
if (!var.empty()) {
std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl
- << "ATMI_MAX_KERNEL_TYPES : positive integer" << std::endl
- << "ATMI_DEVICE_GPU_WORKERS : positive integer" << std::endl
- << "ATMI_DEVICE_CPU_WORKERS : positive integer" << std::endl
<< "ATMI_DEBUG : 1 for printing out trace/debug info"
<< std::endl;
exit(0);
@@ -109,26 +106,6 @@ void Environment::GetEnvAll() {
if (!var.empty())
max_queue_size_ = std::stoi(var);
- var = GetEnv("ATMI_MAX_KERNEL_TYPES");
- if (!var.empty())
- max_kernel_types_ = std::stoi(var);
-
- /* TODO: If we get a good use case for device-specific worker count, we
- * should explore it, but let us keep the worker count uniform for all
- * devices of a type until that time
- */
- var = GetEnv("ATMI_DEVICE_GPU_WORKERS");
- if (!var.empty())
- num_gpu_queues_ = std::stoi(var);
-
- /* TODO: If we get a good use case for device-specific worker count, we
- * should explore it, but let us keep the worker count uniform for all
- * devices of a type until that time
- */
- var = GetEnv("ATMI_DEVICE_CPU_WORKERS");
- if (!var.empty())
- num_cpu_queues_ = std::stoi(var);
-
var = GetEnv("ATMI_DEBUG");
if (!var.empty())
debug_mode_ = std::stoi(var);
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index d6f0f9531283..0ec4c2ac918a 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -23,6 +23,9 @@
#include <libelf.h>
#include <list>
#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <thread>
#include <unordered_map>
#include <vector>
@@ -32,8 +35,6 @@
#include "internal.h"
-#include "internal.h"
-
#include "omptargetplugin.h"
// Get static gpu grid values from clang target-specific constants managed
@@ -355,6 +356,10 @@ class RTLDeviceInfoTy {
std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
public:
+ // load binary populates symbol tables and mutates various global state
+ // run uses those symbol tables
+ std::shared_timed_mutex load_run_lock;
+
int NumberOfDevices;
// GPU devices
@@ -382,6 +387,16 @@ class RTLDeviceInfoTy {
// Resource pools
SignalPoolT FreeSignalPool;
+ struct atmiFreePtrDeletor {
+ void operator()(void *p) {
+ atmi_free(p); // ignore failure to free
+ }
+ };
+
+ // device_State shared across loaded binaries, error if inconsistent size
+ std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>>
+ deviceStateStore;
+
static const int HardTeamLimit = 1 << 20; // 1 Meg
static const int DefaultNumTeams = 128;
static const int Max_Teams =
@@ -393,6 +408,17 @@ class RTLDeviceInfoTy {
static const int Default_WG_Size =
llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
+ atmi_status_t freesignalpool_memcpy(void *dest, const void *src,
+ size_t size) {
+ hsa_signal_t s = FreeSignalPool.pop();
+ if (s.handle == 0) {
+ return ATMI_STATUS_ERROR;
+ }
+ atmi_status_t r = atmi_memcpy(s, dest, src, size);
+ FreeSignalPool.push(s);
+ return r;
+ }
+
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
@@ -491,6 +517,7 @@ class RTLDeviceInfoTy {
WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);
+ deviceStateStore.resize(NumberOfDevices);
for (int i = 0; i < NumberOfDevices; i++) {
uint32_t queue_size = 0;
@@ -511,6 +538,8 @@ class RTLDeviceInfoTy {
DP("Failed to create HSA queues\n");
return;
}
+
+ deviceStateStore[i] = {nullptr, 0};
}
for (int i = 0; i < NumberOfDevices; i++) {
@@ -553,7 +582,10 @@ class RTLDeviceInfoTy {
~RTLDeviceInfoTy() {
DP("Finalizing the HSA-ATMI DeviceInfo.\n");
- KernelArgPoolMap.clear(); // calls hsa to free memory
+ // Run destructors on types that use HSA before
+ // atmi_finalize removes access to it
+ deviceStateStore.clear();
+ KernelArgPoolMap.clear();
atmi_finalize();
}
};
@@ -583,7 +615,9 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)TgtPtr,
(long long unsigned)(Elf64_Addr)HstPtr);
- err = atmi_memcpy(HstPtr, TgtPtr, (size_t)Size);
+
+ err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size);
+
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -608,7 +642,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)HstPtr,
(long long unsigned)(Elf64_Addr)TgtPtr);
- err = atmi_memcpy(TgtPtr, HstPtr, (size_t)Size);
+ err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size);
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from host to device. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -919,20 +953,45 @@ atmi_status_t module_register_from_memory_to_place(void *module_bytes,
}
} // namespace
+static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
+ uint64_t device_State_bytes = 0;
+ {
+ // If this is the deviceRTL, get the state variable size
+ symbol_info size_si;
+ int rc = get_symbol_info_without_loading(
+ ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);
+
+ if (rc == 0) {
+ if (size_si.size != sizeof(uint64_t)) {
+ fprintf(stderr,
+ "Found device_State_size variable with wrong size, aborting\n");
+ exit(1);
+ }
+
+ // Read number of bytes directly from the elf
+ memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
+ }
+ }
+ return device_State_bytes;
+}
+
+static __tgt_target_table *
+__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
+
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {
- static pthread_mutex_t load_binary_mutex = PTHREAD_MUTEX_INITIALIZER;
- pthread_mutex_lock(&load_binary_mutex);
+ DeviceInfo.load_run_lock.lock();
__tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
- pthread_mutex_unlock(&load_binary_mutex);
+ DeviceInfo.load_run_lock.unlock();
return res;
}
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
__tgt_device_image *image) {
+
const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
DeviceInfo.clearOffloadEntriesTable(device_id);
@@ -997,6 +1056,63 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
DP("ATMI module successfully loaded!\n");
+ // Zero the pseudo-bss variable by calling into hsa
+ // Do this post-load to handle got
+ uint64_t device_State_bytes =
+ get_device_State_bytes((char *)image->ImageStart, img_size);
+ auto &dss = DeviceInfo.deviceStateStore[device_id];
+ if (device_State_bytes != 0) {
+
+ if (dss.first.get() == nullptr) {
+ assert(dss.second == 0);
+ void *ptr = NULL;
+ atmi_status_t err =
+ atmi_malloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
+ if (err != ATMI_STATUS_SUCCESS) {
+ fprintf(stderr, "Failed to allocate device_state array\n");
+ return NULL;
+ }
+ dss = {std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
+ device_State_bytes};
+ }
+
+ void *ptr = dss.first.get();
+ if (device_State_bytes != dss.second) {
+ fprintf(stderr, "Inconsistent sizes of device_State unsupported\n");
+ exit(1);
+ }
+
+ void *state_ptr;
+ uint32_t state_ptr_size;
+ err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id),
+ "omptarget_nvptx_device_State",
+ &state_ptr, &state_ptr_size);
+
+ if (err != ATMI_STATUS_SUCCESS) {
+ fprintf(stderr, "failed to find device_state ptr\n");
+ return NULL;
+ }
+ if (state_ptr_size != sizeof(void *)) {
+ fprintf(stderr, "unexpected size of state_ptr %u != %zu\n",
+ state_ptr_size, sizeof(void *));
+ return NULL;
+ }
+
+ // write ptr to device memory so it can be used by later kernels
+ err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *));
+ if (err != ATMI_STATUS_SUCCESS) {
+ fprintf(stderr, "memcpy install of state_ptr failed\n");
+ return NULL;
+ }
+
+ assert((device_State_bytes & 0x3) == 0); // known >= 4 byte aligned
+ hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, device_State_bytes / 4);
+ if (rc != HSA_STATUS_SUCCESS) {
+ fprintf(stderr, "zero fill device_state failed with %u\n", rc);
+ return NULL;
+ }
+ }
+
// TODO: Check with Guansong to understand the below comment more thoroughly.
// Here, we take advantage of the data that is appended after img_end to get
// the symbols' name we need to load. This data consist of the host entries
@@ -1052,7 +1168,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
// If unified memory is present any target link variables
// can access host addresses directly. There is no longer a
// need for device copies.
- err = atmi_memcpy(varptr, e->addr, sizeof(void *));
+ err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *));
if (err != ATMI_STATUS_SUCCESS)
DP("Error when copying USM\n");
DP("Copy linked variable host address (" DPxMOD ")"
@@ -1480,7 +1596,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
void *TgtPtr = NULL;
atmi_status_t err =
atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
- err = atmi_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
+ err =
+ DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
if (print_kernel_trace > 2)
fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
(long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);
@@ -1500,12 +1617,33 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
return packet_id;
}
+static int32_t __tgt_rtl_run_target_team_region_locked(
+ int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+ ptr
diff _t *tgt_offsets, int32_t arg_num, int32_t num_teams,
+ int32_t thread_limit, uint64_t loop_tripcount);
+
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args,
ptr
diff _t *tgt_offsets,
int32_t arg_num, int32_t num_teams,
int32_t thread_limit,
uint64_t loop_tripcount) {
+
+ DeviceInfo.load_run_lock.lock_shared();
+ int32_t res = __tgt_rtl_run_target_team_region_locked(
+ device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
+ thread_limit, loop_tripcount);
+
+ DeviceInfo.load_run_lock.unlock_shared();
+ return res;
+}
+
+int32_t __tgt_rtl_run_target_team_region_locked(
+ int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+ ptr
diff _t *tgt_offsets, int32_t arg_num, int32_t num_teams,
+ int32_t thread_limit, uint64_t loop_tripcount) {
+ static pthread_mutex_t nested_parallel_mutex = PTHREAD_MUTEX_INITIALIZER;
+
// Set the context we are using
// update thread limit content in gpu memory if un-initialized or specified
// from host
@@ -1541,12 +1679,13 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
);
void *TgtCallStack = NULL;
- if (KernelInfo->MaxParLevel > 0)
+ if (KernelInfo->MaxParLevel > 0) {
+ pthread_mutex_lock(&nested_parallel_mutex);
TgtCallStack = AllocateNestedParallelCallMemory(
KernelInfo->MaxParLevel, num_groups, threadsPerGroup,
KernelInfo->device_id, KernelInfo->CallStackAddr,
KernelInfo->ExecutionMode);
-
+ }
if (print_kernel_trace > 0)
// enum modes are SPMD, GENERIC, NONE 0,1,2
fprintf(stderr,
@@ -1664,8 +1803,10 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
DP("Kernel completed\n");
// Free call stack for nested
- if (TgtCallStack)
+ if (TgtCallStack) {
+ pthread_mutex_unlock(&nested_parallel_mutex);
atmi_free(TgtCallStack);
+ }
return OFFLOAD_SUCCESS;
}
More information about the Openmp-commits
mailing list