[Openmp-commits] [openmp] d0b3129 - [libomptarget] Implement host plugin for amdgpu
via Openmp-commits
openmp-commits at lists.llvm.org
Sat Aug 15 15:58:39 PDT 2020
Author: Jon Chesterfield
Date: 2020-08-15T23:58:28+01:00
New Revision: d0b312955f12beba5b03c8a524a8456cb4174bd7
URL: https://github.com/llvm/llvm-project/commit/d0b312955f12beba5b03c8a524a8456cb4174bd7
DIFF: https://github.com/llvm/llvm-project/commit/d0b312955f12beba5b03c8a524a8456cb4174bd7.diff
LOG: [libomptarget] Implement host plugin for amdgpu
Replacement for D71384. Primary difference is inlining the dependency on atmi
followed by extensive simplification and bugfixes. This is the latest version
from https://github.com/ROCm-Developer-Tools/amd-llvm-project/tree/aomp12 with
minor patches and a rename from hsa to amdgpu, on the basis that this can't be
used by other implementations of hsa without additional work.
This will not build unless the ROCM_DIR variable is passed, so it won't break
other builds. That variable is used to locate two amdgpu specific libraries that
ship as part of rocm:
libhsakmt at https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface
libhsa-runtime64 at https://github.com/RadeonOpenCompute/ROCR-Runtime
These libraries are built from source. The build scripts in those repos are for
shared libraries, but can be adapted to statically link both into this plugin.
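For reference, a configure invocation looks something like the following, where
/opt/rocm is an assumed install location for the two libraries above:

    cmake -DROCM_DIR=/opt/rocm <usual llvm/openmp cmake flags> ../openmp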
There are caveats.
- This works well enough to run various tests and benchmarks, and will be used
to support the current clang bring-up
- It is adequately thread safe for the above but there will be races remaining
- It is not stylistically correct for llvm, though it has had clang-format run
- It has suboptimal memory management and locking strategies
- The debug printing / error handling is inconsistent
I would like to contribute this pretty much as-is and then improve it in-tree.
This would be advantageous because the aomp12 branch that was in use for fixing
this codebase has just been joined with the amd internal rocm dev process.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D85742
Added:
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
openmp/libomptarget/plugins/amdgpu/impl/atmi.h
openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h
openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
openmp/libomptarget/plugins/amdgpu/impl/data.cpp
openmp/libomptarget/plugins/amdgpu/impl/data.h
openmp/libomptarget/plugins/amdgpu/impl/internal.h
openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
openmp/libomptarget/plugins/amdgpu/impl/machine.h
openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
openmp/libomptarget/plugins/amdgpu/impl/rt.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Modified:
openmp/libomptarget/plugins/CMakeLists.txt
openmp/libomptarget/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
index f8372b619407..a6d291e7758f 100644
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/CMakeLists.txt
@@ -66,6 +66,7 @@ endif()
endmacro()
add_subdirectory(aarch64)
+add_subdirectory(amdgpu)
add_subdirectory(cuda)
add_subdirectory(ppc64)
add_subdirectory(ppc64le)
diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
new file mode 100644
index 000000000000..47ae00ede2ce
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
@@ -0,0 +1,84 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an AMDGPU machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+################################################################################
+
+if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building AMDGPU plugin: LIBELF not found")
+ return()
+endif()
+
+if(NOT ROCM_DIR)
+ libomptarget_say("Not building AMDGPU plugin: ROCM_DIR is not set")
+ return()
+endif()
+
+set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS ${ROCM_DIR}/hsa/include ${ROCM_DIR}/hsa/include/hsa)
+set(LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${ROCM_DIR}/hsa/lib)
+set(LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS ${ROCM_DIR}/lib)
+
+mark_as_advanced( LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS)
+
+if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+ libomptarget_say("Not building amdgpu plugin: only support amdgpu in Linux x86_64, ppc64le, or aarch64 hosts.")
+ return()
+endif()
+libomptarget_say("Building amdgpu offloading plugin using ROCM_DIR = ${ROCM_DIR}")
+
+libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}")
+libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}")
+libomptarget_say("LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}")
+
+################################################################################
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=AMDGPU)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
+ add_definitions(-DLITTLEENDIAN_CPU=1)
+endif()
+
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ add_definitions(-DDEBUG)
+endif()
+
+include_directories(
+ ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/impl
+)
+
+add_library(omptarget.rtl.amdgpu SHARED
+ impl/atmi.cpp
+ impl/atmi_interop_hsa.cpp
+ impl/data.cpp
+ impl/machine.cpp
+ impl/system.cpp
+ impl/utils.cpp
+ impl/msgpack.cpp
+ src/rtl.cpp
+ )
+
+# Install plugin under the lib destination folder.
+# When we build for debug, OPENMP_LIBDIR_SUFFIX gets set to -debug
+install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}")
+
+target_link_libraries(
+ omptarget.rtl.amdgpu
+ -lpthread -ldl -Wl,-rpath,${OPENMP_INSTALL_LIBDIR}
+ -L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS} -L${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS},-rpath,${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}
+ -lelf
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+ "-Wl,-z,defs"
+ )
+
+# Report to the parent scope that we are building a plugin for amdgpu
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
+
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
new file mode 100644
index 000000000000..3856403504a9
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
@@ -0,0 +1,44 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include "rt.h"
+/*
+ * Initialize/Finalize
+ */
+atmi_status_t atmi_init() { return core::Runtime::Initialize(); }
+
+atmi_status_t atmi_finalize() { return core::Runtime::Finalize(); }
+
+/*
+ * Machine Info
+ */
+atmi_machine_t *atmi_machine_get_info() {
+ return core::Runtime::GetMachineInfo();
+}
+
+/*
+ * Modules
+ */
+atmi_status_t atmi_module_register_from_memory_to_place(
+ void *module_bytes, size_t module_size, atmi_place_t place,
+ atmi_status_t (*on_deserialized_data)(void *data, size_t size,
+ void *cb_state),
+ void *cb_state) {
+ return core::Runtime::getInstance().RegisterModuleFromMemory(
+ module_bytes, module_size, place, on_deserialized_data, cb_state);
+}
+
+/*
+ * Data
+ */
+atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) {
+ return core::Runtime::Memcpy(dest, src, size);
+}
+
+atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
+
+atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
+ return core::Runtime::Malloc(ptr, size, place);
+}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h
new file mode 100644
index 000000000000..bfe95f93dbaf
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.h
@@ -0,0 +1,203 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef INCLUDE_ATMI_H_
+#define INCLUDE_ATMI_H_
+
+#define ROCM_VERSION_MAJOR 3
+#define ROCM_VERSION_MINOR 2
+
+/** \defgroup enumerations Enumerated Types
+ * @{
+ */
+
+/**
+ * @brief Status codes.
+ */
+typedef enum atmi_status_t {
+ /**
+ * The function has been executed successfully.
+ */
+ ATMI_STATUS_SUCCESS = 0,
+ /**
+ * An undocumented error has occurred.
+ */
+ ATMI_STATUS_UNKNOWN = 1,
+ /**
+ * A generic error has occurred.
+ */
+ ATMI_STATUS_ERROR = 2,
+} atmi_status_t;
+
+/**
+ * @brief Device Types.
+ */
+typedef enum atmi_devtype_s {
+ ATMI_DEVTYPE_CPU = 0x0001,
+ ATMI_DEVTYPE_iGPU = 0x0010, // Integrated GPU
+ ATMI_DEVTYPE_dGPU = 0x0100, // Discrete GPU
+ ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU
+ ATMI_DEVTYPE_ALL = 0x111 // Union of all device types
+} atmi_devtype_t;
+
+/**
+ * @brief Memory Access Type.
+ */
+typedef enum atmi_memtype_s {
+ ATMI_MEMTYPE_FINE_GRAINED = 0,
+ ATMI_MEMTYPE_COARSE_GRAINED = 1,
+ ATMI_MEMTYPE_ANY
+} atmi_memtype_t;
+
+/**
+ * @brief ATMI Memory Fences for Tasks.
+ */
+typedef enum atmi_task_fence_scope_s {
+ /**
+ * No memory fence applied; external fences have to be applied around the task
+ * launch/completion.
+ */
+ ATMI_FENCE_SCOPE_NONE = 0,
+ /**
+ * The fence is applied to the device.
+ */
+ ATMI_FENCE_SCOPE_DEVICE = 1,
+ /**
+ * The fence is applied to the entire system.
+ */
+ ATMI_FENCE_SCOPE_SYSTEM = 2
+} atmi_task_fence_scope_t;
+
+/** @} */
+
+/** \defgroup common Common ATMI Structures
+ * @{
+ */
+
+/**
+ * @brief ATMI Compute Place
+ */
+typedef struct atmi_place_s {
+ /**
+ * The node in a cluster where computation should occur.
+ * Default is node_id = 0 for local computations.
+ */
+ unsigned int node_id;
+ /**
+ * Device type: CPU, GPU or DSP
+ */
+ atmi_devtype_t type;
+ /**
+ * The device ordinal number ordered by runtime; -1 for any
+ */
+ int device_id;
+} atmi_place_t;
+
+/**
+ * @brief ATMI Memory Place
+ */
+typedef struct atmi_mem_place_s {
+ /**
+ * The node in a cluster where computation should occur.
+ * Default is node_id = 0 for local computations.
+ */
+ unsigned int node_id;
+ /**
+ * Device type: CPU, GPU or DSP
+ */
+ atmi_devtype_t dev_type;
+ /**
+ * The device ordinal number ordered by runtime; -1 for any
+ */
+ int dev_id;
+ // atmi_memtype_t mem_type; // Fine grained or Coarse grained
+ /**
+ * The memory space/region ordinal number ordered by runtime; -1 for any
+ */
+ int mem_id;
+} atmi_mem_place_t;
+
+/**
+ * @brief ATMI Memory Space/region Structure
+ */
+typedef struct atmi_memory_s {
+ /**
+ * Memory capacity
+ */
+ unsigned long int capacity;
+ /**
+ * Memory type
+ */
+ atmi_memtype_t type;
+} atmi_memory_t;
+
+/**
+ * @brief ATMI Device Structure
+ */
+typedef struct atmi_device_s {
+ /**
+ * Device type: CPU, GPU or DSP
+ */
+ atmi_devtype_t type;
+ /**
+ * The number of compute cores
+ */
+ unsigned int core_count;
+ /**
+ * The number of memory spaces/regions that are accessible
+ * from this device
+ */
+ unsigned int memory_count;
+ /**
+ * Array of memory spaces/regions that are accessible
+ * from this device.
+ */
+ atmi_memory_t *memories;
+} atmi_device_t;
+
+/**
+ * @brief ATMI Machine Structure
+ */
+typedef struct atmi_machine_s {
+ /**
+ * The number of devices categorized by the device type
+ */
+ unsigned int device_count_by_type[ATMI_DEVTYPE_ALL];
+ /**
+ * The device structures categorized by the device type
+ */
+ atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL];
+} atmi_machine_t;
+
+// Below are some helper macros that can be used to setup
+// some of the ATMI data structures.
+#define ATMI_PLACE_CPU(node, cpu_id) \
+ { .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id }
+#define ATMI_PLACE_GPU(node, gpu_id) \
+ { .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id }
+#define ATMI_MEM_PLACE_CPU(node, cpu_id) \
+ { \
+ .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
+ .mem_id = -1 \
+ }
+#define ATMI_MEM_PLACE_GPU(node, gpu_id) \
+ { \
+ .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
+ .mem_id = -1 \
+ }
+#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id) \
+ { \
+ .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
+ .mem_id = cpu_mem_id \
+ }
+#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id) \
+ { \
+ .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
+ .mem_id = gpu_mem_id \
+ }
+#define ATMI_MEM_PLACE(d_type, d_id, m_id) \
+ { .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id }
+
+#endif // INCLUDE_ATMI_H_
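A small sketch of the helper macros in use; they expand to designated
initializers and can be used directly, matching how data.cpp below builds its
staging place:

    atmi_mem_place_t gpu0 = ATMI_MEM_PLACE_GPU(0, 0); // node 0, device 0, any mem
    atmi_place_t cpu0 = ATMI_PLACE_CPU(0, 0);         // node 0, cpu 0
    // Device types are bit flags, so iGPU and dGPU both match ATMI_DEVTYPE_GPU:
    bool is_gpu = (gpu0.dev_type & ATMI_DEVTYPE_GPU) != 0;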
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
new file mode 100644
index 000000000000..ac52b89cb4f0
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
@@ -0,0 +1,96 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include "atmi_interop_hsa.h"
+#include "internal.h"
+
+using core::atl_is_atmi_initialized;
+
+atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
+ const char *symbol,
+ void **var_addr,
+ unsigned int *var_size) {
+ /*
+ // Typical usage:
+ void *var_addr;
+ unsigned int var_size;
+ atmi_interop_hsa_get_symbol_info(gpu_place, "symbol_name", &var_addr,
+ &var_size);
+ atmi_memcpy(host_addr, var_addr, var_size);
+ */
+
+ if (!atl_is_atmi_initialized())
+ return ATMI_STATUS_ERROR;
+ atmi_machine_t *machine = atmi_machine_get_info();
+ if (!symbol || !var_addr || !var_size || !machine)
+ return ATMI_STATUS_ERROR;
+ if (place.dev_id < 0 ||
+ place.dev_id >= machine->device_count_by_type[place.dev_type])
+ return ATMI_STATUS_ERROR;
+
+ // get the symbol info
+ std::string symbolStr = std::string(symbol);
+ if (SymbolInfoTable[place.dev_id].find(symbolStr) !=
+ SymbolInfoTable[place.dev_id].end()) {
+ atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr];
+ *var_addr = reinterpret_cast<void *>(info.addr);
+ *var_size = info.size;
+ return ATMI_STATUS_SUCCESS;
+ } else {
+ *var_addr = NULL;
+ *var_size = 0;
+ return ATMI_STATUS_ERROR;
+ }
+}
+
+atmi_status_t atmi_interop_hsa_get_kernel_info(
+ atmi_mem_place_t place, const char *kernel_name,
+ hsa_executable_symbol_info_t kernel_info, uint32_t *value) {
+ /*
+ // Typical usage:
+ uint32_t value;
+ atmi_interop_hsa_get_kernel_info(gpu_place, "kernel_name",
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+ &value);
+ */
+
+ if (!atl_is_atmi_initialized())
+ return ATMI_STATUS_ERROR;
+ atmi_machine_t *machine = atmi_machine_get_info();
+ if (!kernel_name || !value || !machine)
+ return ATMI_STATUS_ERROR;
+ if (place.dev_id < 0 ||
+ place.dev_id >= machine->device_count_by_type[place.dev_type])
+ return ATMI_STATUS_ERROR;
+
+ atmi_status_t status = ATMI_STATUS_SUCCESS;
+ // get the kernel info
+ std::string kernelStr = std::string(kernel_name);
+ if (KernelInfoTable[place.dev_id].find(kernelStr) !=
+ KernelInfoTable[place.dev_id].end()) {
+ atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr];
+ switch (kernel_info) {
+ case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE:
+ *value = info.group_segment_size;
+ break;
+ case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE:
+ *value = info.private_segment_size;
+ break;
+ case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE:
+ // return the size for non-implicit args
+ *value = info.kernel_segment_size - sizeof(atmi_implicit_args_t);
+ break;
+ default:
+ *value = 0;
+ status = ATMI_STATUS_ERROR;
+ break;
+ }
+ } else {
+ *value = 0;
+ status = ATMI_STATUS_ERROR;
+ }
+
+ return status;
+}
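A hedged sketch of consuming this query ("my_kernel" is an illustrative name):
the KERNARG size reported above deliberately excludes the implicit arguments,
so a launcher adds them back when sizing the kernarg buffer:

    uint32_t explicit_bytes = 0;
    atmi_mem_place_t gpu = ATMI_MEM_PLACE_GPU(0, 0);
    if (atmi_interop_hsa_get_kernel_info(
            gpu, "my_kernel",
            HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
            &explicit_bytes) == ATMI_STATUS_SUCCESS) {
      size_t total = explicit_bytes + sizeof(atmi_implicit_args_t);
      (void)total; // size of the kernarg allocation for a dispatch
    }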
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h
new file mode 100644
index 000000000000..c0f588215e8a
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h
@@ -0,0 +1,86 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef INCLUDE_ATMI_INTEROP_HSA_H_
+#define INCLUDE_ATMI_INTEROP_HSA_H_
+
+#include "atmi_runtime.h"
+#include "hsa.h"
+#include "hsa_ext_amd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/** \defgroup interop_hsa_functions ATMI-HSA Interop
+ * @{
+ */
+
+/**
+ * @brief Get the device address and size of an HSA global symbol
+ *
+ * @detail Use this function to query the device address and size of an HSA
+ * global symbol.
+ * The symbol can be set by the compiler or by the application writer in a
+ * language-specific manner. This function is meaningful only after calling one
+ * of the @p atmi_module_register functions.
+ *
+ * @param[in] place The ATMI memory place
+ *
+ * @param[in] symbol Pointer to a non-NULL global symbol name
+ *
+ * @param[in] var_addr Pointer to a non-NULL @p void* variable that will
+ * hold the device address of the global symbol object.
+ *
+ * @param[in] var_size Pointer to a non-NULL @p uint variable that will
+ * hold the size of the global symbol object.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are
+ * an invalid location in the current node, or if ATMI is not initialized.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ */
+atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
+ const char *symbol,
+ void **var_addr,
+ unsigned int *var_size);
+
+/**
+ * @brief Get the HSA-specific kernel info from a kernel name
+ *
+ * @detail Use this function to query the HSA-specific kernel info from the
+ * kernel name.
+ * This function is meaningful only after calling one
+ * of the @p atmi_module_register functions.
+ *
+ * @param[in] place The ATMI memory place
+ *
+ * @param[in] kernel_name Pointer to a char array with the kernel name
+ *
+ * @param[in] info The different possible kernel properties
+ *
+ * @param[in] value Pointer to a non-NULL @p uint variable that will
+ * hold the return value of the kernel property.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR If @p kernel_name or @p value are an invalid
+ * location in the current node, or if ATMI is not initialized.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ */
+atmi_status_t atmi_interop_hsa_get_kernel_info(
+ atmi_mem_place_t place, const char *kernel_name,
+ hsa_executable_symbol_info_t info, uint32_t *value);
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // INCLUDE_ATMI_INTEROP_HSA_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
new file mode 100644
index 000000000000..4d8323e55651
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_kl.h
@@ -0,0 +1,39 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef INCLUDE_ATMI_KL_H_
+#define INCLUDE_ATMI_KL_H_
+
+#include "atmi.h"
+#ifdef __OPENCL_C_VERSION__
+#include "ockl_hsa.h"
+#endif
+#define MAX_NUM_KERNELS (1024 * 16)
+
+typedef struct atmi_implicit_args_s {
+ unsigned long offset_x;
+ unsigned long offset_y;
+ unsigned long offset_z;
+ unsigned long hostcall_ptr;
+ char num_gpu_queues;
+ unsigned long gpu_queue_ptr;
+ char num_cpu_queues;
+ unsigned long cpu_worker_signals;
+ unsigned long cpu_queue_ptr;
+ unsigned long kernarg_template_ptr;
+ // possible TODO: send signal pool to be used by DAGs on GPU
+ // uint8_t num_signals;
+ // unsigned long signal_ptr;
+} atmi_implicit_args_t;
+
+typedef struct atmi_kernel_enqueue_template_s {
+ unsigned long kernel_handle;
+ hsa_kernel_dispatch_packet_t k_packet;
+ hsa_agent_dispatch_packet_t a_packet;
+ unsigned long kernarg_segment_size;
+ void *kernarg_regions;
+} atmi_kernel_enqueue_template_t;
+
+#endif // INCLUDE_ATMI_KL_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
new file mode 100644
index 000000000000..04fddd0b2d61
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
@@ -0,0 +1,193 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef INCLUDE_ATMI_RUNTIME_H_
+#define INCLUDE_ATMI_RUNTIME_H_
+
+#include "atmi.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** \defgroup context_functions ATMI Context Setup and Finalize
+ * @{
+ */
+/**
+ * @brief Initialize the ATMI runtime environment.
+ *
+ * @detail All ATMI runtime functions will fail if this function is not called
+ * at least once. The user may initialize different device types at different
+ * regions in the program for optimization purposes.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ */
+atmi_status_t atmi_init();
+
+/**
+ * @brief Finalize the ATMI runtime environment.
+ *
+ * @detail ATMI runtime functions will fail if called after finalize.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ */
+atmi_status_t atmi_finalize();
+/** @} */
+
+/** \defgroup module_functions ATMI Module
+ * @{
+ */
+
+/**
+ * @brief Register the ATMI code module from memory on to a specific place
+ * (device).
+ *
+ * @detail Currently, only GPU devices need explicit module registration because
+ * of their specific ISAs that require a separate compilation phase. On the
+ * other
+ * hand, CPU devices execute regular x86 functions that are compiled with the
+ * host program.
+ *
+ * @param[in] module_bytes A memory region that contains the GPU modules
+ * targeting ::AMDGCN platform types. Value cannot be NULL.
+ *
+ * @param[in] module_size Size of module region
+ *
+ * @param[in] place Denotes the execution place (device) on which the module
+ * should be registered and loaded.
+ *
+ * @param[in] on_deserialized_data Callback run on deserialized code object,
+ * before loading it
+ *
+ * @param[in] cb_state void* passed to on_deserialized_data callback
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ *
+ */
+atmi_status_t atmi_module_register_from_memory_to_place(
+ void *module_bytes, size_t module_size, atmi_place_t place,
+ atmi_status_t (*on_deserialized_data)(void *data, size_t size,
+ void *cb_state),
+ void *cb_state);
+
+/** @} */
+
+/** \defgroup machine ATMI Machine
+ * @{
+ */
+/**
+ * @brief ATMI's device discovery function to get the current machine's
+ * topology.
+ *
+ * @detail The @p atmi_machine_t structure is a tree-based representation of the
+ * compute and memory elements in the current node. Once ATMI is initialized,
+ * this function can be called to retrieve the pointer to this global structure.
+ *
+ * @return Returns a pointer to a global structure of type @p atmi_machine_t.
+ * Returns NULL if ATMI is not initialized.
+ */
+atmi_machine_t *atmi_machine_get_info();
+/** @} */
+
+/** \defgroup memory_functions ATMI Data Management
+ * @{
+ */
+/**
+ * @brief Allocate memory from the specified memory place.
+ *
+ * @detail This function allocates memory from the specified memory place. If
+ * the memory place belongs primarily to the CPU, then the memory will be
+ * accessible by other GPUs and CPUs in the system. If the memory place
+ * belongs primarily to a GPU, then it cannot be accessed by other devices
+ * in the system.
+ *
+ * @param[in] ptr The pointer to the memory that will be allocated.
+ *
+ * @param[in] size The size of the allocation in bytes.
+ *
+ * @param[in] place The memory place in the system to perform the allocation.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ *
+ */
+atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
+
+/**
+ * @brief Frees memory that was previously allocated.
+ *
+ * @detail This function frees memory that was previously allocated by calling
+ * @p atmi_malloc. It throws an error otherwise. It is illegal to access a
+ * pointer after a call to this function.
+ *
+ * @param[in] ptr The pointer to the memory that has to be freed.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ *
+ */
+atmi_status_t atmi_free(void *ptr);
+
+/**
+ * @brief Synchronously copy memory from the source to destination memory
+ * locations.
+ *
+ * @detail This function assumes that the source and destination regions are
+ * non-overlapping. The runtime determines the memory place of the source and
+ * the destination and executes the appropriate optimized data movement
+ * methodology.
+ *
+ * @param[in] dest The destination pointer previously allocated by a system
+ * allocator or @p atmi_malloc.
+ *
+ * @param[in] src The source pointer previously allocated by a system
+ * allocator or @p atmi_malloc.
+ *
+ * @param[in] size The size of the data to be copied in bytes.
+ *
+ * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
+ *
+ * @retval ::ATMI_STATUS_ERROR The function encountered errors.
+ *
+ * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
+ *
+ */
+atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size);
+
+/** @} */
+
+/** \defgroup cpu_dev_runtime ATMI CPU Device Runtime
+ * @{
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // INCLUDE_ATMI_RUNTIME_H_
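Putting the data-management entry points together, a minimal round-trip sketch
(the place and buffer sizes are assumptions; per data.cpp below, a plain host
pointer is fine on one side of the copy while device memory must come from
atmi_malloc):

    atmi_mem_place_t gpu_place = ATMI_MEM_PLACE_GPU(0, 0);
    char host_in[64] = {1}, host_out[64] = {0};
    void *dev = nullptr;
    atmi_malloc(&dev, sizeof(host_in), gpu_place);
    atmi_memcpy(dev, host_in, sizeof(host_in));   // host -> device
    atmi_memcpy(host_out, dev, sizeof(host_out)); // device -> host
    atmi_free(dev);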
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
new file mode 100644
index 000000000000..cf5cd8f1b5a1
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
@@ -0,0 +1,203 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include "data.h"
+#include "atmi_runtime.h"
+#include "internal.h"
+#include "machine.h"
+#include "rt.h"
+#include <cassert>
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+#include <iostream>
+#include <stdio.h>
+#include <string.h>
+#include <thread>
+#include <vector>
+
+using core::TaskImpl;
+extern ATLMachine g_atl_machine;
+extern hsa_signal_t IdentityCopySignal;
+
+namespace core {
+ATLPointerTracker g_data_map; // Track all am pointer allocations.
+void allow_access_to_all_gpu_agents(void *ptr);
+
+const char *getPlaceStr(atmi_devtype_t type) {
+ switch (type) {
+ case ATMI_DEVTYPE_CPU:
+ return "CPU";
+ case ATMI_DEVTYPE_GPU:
+ return "GPU";
+ default:
+ return NULL;
+ }
+}
+
+std::ostream &operator<<(std::ostream &os, const ATLData *ap) {
+ atmi_mem_place_t place = ap->place();
+ os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size()
+ << " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id
+ << ", " << place.mem_id << ")";
+ return os;
+}
+
+void ATLPointerTracker::insert(void *pointer, ATLData *p) {
+ std::lock_guard<std::mutex> l(mutex_);
+
+ DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size());
+ tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p));
+}
+
+void ATLPointerTracker::remove(void *pointer) {
+ std::lock_guard<std::mutex> l(mutex_);
+ DEBUG_PRINT("remove: %p\n", pointer);
+ tracker_.erase(ATLMemoryRange(pointer, 1));
+}
+
+ATLData *ATLPointerTracker::find(const void *pointer) {
+ std::lock_guard<std::mutex> l(mutex_);
+ ATLData *ret = NULL;
+ auto iter = tracker_.find(ATLMemoryRange(pointer, 1));
+ DEBUG_PRINT("find: %p\n", pointer);
+ if (iter != tracker_.end()) // found
+ ret = iter->second;
+ return ret;
+}
+
+ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
+ int dev_id = place.dev_id;
+ switch (place.dev_type) {
+ case ATMI_DEVTYPE_CPU:
+ return g_atl_machine.processors<ATLCPUProcessor>()[dev_id];
+ case ATMI_DEVTYPE_GPU:
+ return g_atl_machine.processors<ATLGPUProcessor>()[dev_id];
+ }
+}
+
+static hsa_agent_t get_mem_agent(atmi_mem_place_t place) {
+ return get_processor_by_mem_place(place).agent();
+}
+
+hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) {
+ ATLProcessor &proc = get_processor_by_mem_place(place);
+ return get_memory_pool(proc, place.mem_id);
+}
+
+void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) {
+ ATLData *data = new ATLData(ptr, size, place);
+ g_data_map.insert(ptr, data);
+ if (place.dev_type == ATMI_DEVTYPE_CPU)
+ allow_access_to_all_gpu_agents(ptr);
+ // TODO(ashwinma): what if one GPU wants to access another GPU?
+}
+
+atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) {
+ atmi_status_t ret = ATMI_STATUS_SUCCESS;
+ hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place);
+ hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr);
+ ErrorCheck(atmi_malloc, err);
+ DEBUG_PRINT("Malloced [%s %d] %p\n",
+ place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id,
+ *ptr);
+ if (err != HSA_STATUS_SUCCESS)
+ ret = ATMI_STATUS_ERROR;
+
+ register_allocation(*ptr, size, place);
+
+ return ret;
+}
+
+atmi_status_t Runtime::Memfree(void *ptr) {
+ atmi_status_t ret = ATMI_STATUS_SUCCESS;
+ hsa_status_t err;
+ ATLData *data = g_data_map.find(ptr);
+ if (!data)
+ ErrorCheck(Checking pointer info userData,
+ HSA_STATUS_ERROR_INVALID_ALLOCATION);
+
+ g_data_map.remove(ptr);
+ delete data;
+
+ err = hsa_amd_memory_pool_free(ptr);
+ ErrorCheck(atmi_free, err);
+ DEBUG_PRINT("Freed %p\n", ptr);
+
+ if (err != HSA_STATUS_SUCCESS || !data)
+ ret = ATMI_STATUS_ERROR;
+ return ret;
+}
+
+static hsa_status_t invoke_hsa_copy(void *dest, const void *src, size_t size,
+ hsa_agent_t agent) {
+ // TODO: Use thread safe signal
+ hsa_signal_store_release(IdentityCopySignal, 1);
+
+ hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
+ NULL, IdentityCopySignal);
+ ErrorCheck(Copy async between memory pools, err);
+
+ // TODO: async reports errors in the signal, use NE 1
+ hsa_signal_wait_acquire(IdentityCopySignal, HSA_SIGNAL_CONDITION_EQ, 0,
+ UINT64_MAX, ATMI_WAIT_STATE);
+
+ return err;
+}
+
+atmi_status_t Runtime::Memcpy(void *dest, const void *src, size_t size) {
+ atmi_status_t ret;
+ hsa_status_t err;
+ ATLData *src_data = g_data_map.find(src);
+ ATLData *dest_data = g_data_map.find(dest);
+ atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
+ void *temp_host_ptr;
+
+ if (src_data && !dest_data) {
+ // Copy from device to scratch to host
+ hsa_agent_t agent = get_mem_agent(src_data->place());
+ DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle);
+ ret = atmi_malloc(&temp_host_ptr, size, cpu);
+ if (ret != ATMI_STATUS_SUCCESS) {
+ return ret;
+ }
+
+ err = invoke_hsa_copy(temp_host_ptr, src, size, agent);
+ if (err != HSA_STATUS_SUCCESS) {
+ return ATMI_STATUS_ERROR;
+ }
+
+ memcpy(dest, temp_host_ptr, size);
+
+ } else if (!src_data && dest_data) {
+ // Copy from host to scratch to device
+ hsa_agent_t agent = get_mem_agent(dest_data->place());
+ DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle);
+ ret = atmi_malloc(&temp_host_ptr, size, cpu);
+ if (ret != ATMI_STATUS_SUCCESS) {
+ return ret;
+ }
+
+ memcpy(temp_host_ptr, src, size);
+
+ DEBUG_PRINT("Memcpy device agent: %lu\n", agent.handle);
+ err = invoke_hsa_copy(dest, temp_host_ptr, size, agent);
+
+ } else if (!src_data && !dest_data) {
+ DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
+ // would be host to host, just call memcpy, or missing metadata
+ return ATMI_STATUS_ERROR;
+ } else {
+ DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n");
+ return ATMI_STATUS_ERROR;
+ }
+
+ ret = atmi_free(temp_host_ptr);
+
+ if (err != HSA_STATUS_SUCCESS || ret != ATMI_STATUS_SUCCESS)
+ ret = ATMI_STATUS_ERROR;
+ return ret;
+}
+
+} // namespace core
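One consequence of the direction logic above, shown as a sketch: both pointers
are classified through g_data_map, so a copy where neither side was allocated
via atmi_malloc is rejected rather than silently degraded to a host-to-host
memcpy:

    char a[16], b[16];
    // Neither pointer is tracked: this hits the "without metadata" branch
    // above and returns ATMI_STATUS_ERROR instead of copying.
    atmi_status_t rc = atmi_memcpy(b, a, sizeof(a));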
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.h b/openmp/libomptarget/plugins/amdgpu/impl/data.h
new file mode 100644
index 000000000000..fa9e7380bf67
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/data.h
@@ -0,0 +1,83 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef SRC_RUNTIME_INCLUDE_DATA_H_
+#define SRC_RUNTIME_INCLUDE_DATA_H_
+#include "atmi.h"
+#include <hsa.h>
+#include <map>
+#include <mutex>
+#include <stdio.h>
+#include <stdlib.h>
+// we maintain our own mapping of device addr to a user specified data object
+// in order to work around a (possibly historic) bug in ROCr's
+// hsa_amd_pointer_info_set_userdata for variable symbols
+// this is expected to be temporary
+
+namespace core {
+// Internal representation of any data that is created and managed by ATMI.
+// Data can be located on any device memory or host memory.
+class ATLData {
+public:
+ ATLData(void *ptr, size_t size, atmi_mem_place_t place)
+ : ptr_(ptr), size_(size), place_(place) {}
+
+ void *ptr() const { return ptr_; }
+ size_t size() const { return size_; }
+ atmi_mem_place_t place() const { return place_; }
+
+private:
+ void *ptr_;
+ size_t size_;
+ atmi_mem_place_t place_;
+};
+
+//---
+struct ATLMemoryRange {
+ const void *base_pointer;
+ const void *end_pointer;
+ ATLMemoryRange(const void *bp, size_t size_bytes)
+ : base_pointer(bp),
+ end_pointer(reinterpret_cast<const unsigned char *>(bp) + size_bytes -
+ 1) {}
+};
+
+// Functor to compare ranges:
+struct ATLMemoryRangeCompare {
+ // Return true if the LHS range is less than RHS - used to order the ranges
+ bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const {
+ return lhs.end_pointer < rhs.base_pointer;
+ }
+};
+
+//-------------------------------------------------------------------------------------------------
+// This structure tracks information for each pointer.
+// Uses memory-range-based lookups - so pointers that exist anywhere in the
+// range of hostPtr + size will find the associated ATLData.
+// The insertions and lookups use a self-balancing binary tree and should
+// support O(logN) lookup speed.
+// The structure is thread-safe - writers obtain a mutex before modifying the
+// tree. Multiple simultaneous readers are supported.
+class ATLPointerTracker {
+ typedef std::map<ATLMemoryRange, ATLData *, ATLMemoryRangeCompare>
+ MapTrackerType;
+
+public:
+ void insert(void *pointer, ATLData *data);
+ void remove(void *pointer);
+ ATLData *find(const void *pointer);
+
+private:
+ MapTrackerType tracker_;
+ std::mutex mutex_;
+};
+
+extern ATLPointerTracker g_data_map; // Track all am pointer allocations.
+
+enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H };
+
+} // namespace core
+#endif // SRC_RUNTIME_INCLUDE_DATA_H_
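A sketch of the range-based lookup this provides (ptr is assumed to be a
tracked allocation of at least nine bytes):

    // Any interior pointer resolves to the same record: the comparator
    // orders ranges by end < base, so overlapping ranges compare
    // equivalent in the std::map.
    core::ATLData *rec = core::g_data_map.find(ptr);
    core::ATLData *same = core::g_data_map.find((char *)ptr + 8);
    // rec == same for any offset inside [ptr, ptr + size)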
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
new file mode 100644
index 000000000000..1b1d69328785
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -0,0 +1,266 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
+#define SRC_RUNTIME_INCLUDE_INTERNAL_H_
+#include <inttypes.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atomic>
+#include <cstring>
+#include <deque>
+#include <map>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "hsa.h"
+#include "hsa_ext_amd.h"
+#include "hsa_ext_finalize.h"
+
+#include "atmi.h"
+#include "atmi_runtime.h"
+#include "rt.h"
+
+#define MAX_NUM_KERNELS (1024 * 16)
+
+typedef struct atmi_implicit_args_s {
+ unsigned long offset_x;
+ unsigned long offset_y;
+ unsigned long offset_z;
+ unsigned long hostcall_ptr;
+ char num_gpu_queues;
+ unsigned long gpu_queue_ptr;
+ char num_cpu_queues;
+ unsigned long cpu_worker_signals;
+ unsigned long cpu_queue_ptr;
+ unsigned long kernarg_template_ptr;
+} atmi_implicit_args_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define check(msg, status) \
+ if (status != HSA_STATUS_SUCCESS) { \
+ printf("%s failed.\n", #msg); \
+ exit(1); \
+ }
+
+#ifdef DEBUG
+#define DEBUG_PRINT(fmt, ...) \
+ if (core::Runtime::getInstance().getDebugMode()) { \
+ fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \
+ }
+#else
+#define DEBUG_PRINT(...) \
+ do { \
+ } while (false)
+#endif
+
+#ifndef HSA_RUNTIME_INC_HSA_H_
+typedef struct hsa_signal_s {
+ uint64_t handle;
+} hsa_signal_t;
+#endif
+
+/* All global values go in this global structure */
+typedef struct atl_context_s {
+ bool struct_initialized;
+ bool g_hsa_initialized;
+ bool g_gpu_initialized;
+ bool g_tasks_initialized;
+} atl_context_t;
+extern atl_context_t atlc;
+extern atl_context_t *atlc_p;
+
+#ifdef __cplusplus
+}
+#endif
+
+/* ---------------------------------------------------------------------------------
+ * Simulated CPU Data Structures and API
+ * ---------------------------------------------------------------------------------
+ */
+
+#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED
+
+// ---------------------- Kernel Start -------------
+typedef struct atl_kernel_info_s {
+ uint64_t kernel_object;
+ uint32_t group_segment_size;
+ uint32_t private_segment_size;
+ uint32_t kernel_segment_size;
+ uint32_t num_args;
+ std::vector<uint64_t> arg_alignments;
+ std::vector<uint64_t> arg_offsets;
+ std::vector<uint64_t> arg_sizes;
+} atl_kernel_info_t;
+
+typedef struct atl_symbol_info_s {
+ uint64_t addr;
+ uint32_t size;
+} atl_symbol_info_t;
+
+extern std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
+extern std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
+
+// ---------------------- Kernel End -------------
+
+extern struct timespec context_init_time;
+
+namespace core {
+class TaskgroupImpl;
+class TaskImpl;
+class Kernel;
+class KernelImpl;
+} // namespace core
+
+struct SignalPoolT {
+ SignalPoolT() {
+ // If no signals are created, and none can be created later,
+ // will ultimately fail at pop()
+
+ unsigned N = 1024; // default max pool size from atmi
+ for (unsigned i = 0; i < N; i++) {
+ hsa_signal_t new_signal;
+ hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
+ if (err != HSA_STATUS_SUCCESS) {
+ break;
+ }
+ state.push(new_signal);
+ }
+ DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size());
+ }
+ SignalPoolT(const SignalPoolT &) = delete;
+ SignalPoolT(SignalPoolT &&) = delete;
+ ~SignalPoolT() {
+ size_t N = state.size();
+ for (size_t i = 0; i < N; i++) {
+ hsa_signal_t signal = state.front();
+ state.pop();
+ hsa_status_t rc = hsa_signal_destroy(signal);
+ if (rc != HSA_STATUS_SUCCESS) {
+ DEBUG_PRINT("Signal pool destruction failed\n");
+ }
+ }
+ }
+ size_t size() {
+ lock l(&mutex);
+ return state.size();
+ }
+ void push(hsa_signal_t s) {
+ lock l(&mutex);
+ state.push(s);
+ }
+ hsa_signal_t pop(void) {
+ lock l(&mutex);
+ if (!state.empty()) {
+ hsa_signal_t res = state.front();
+ state.pop();
+ return res;
+ }
+
+ // Pool empty, attempt to create another signal
+ hsa_signal_t new_signal;
+ hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
+ if (err == HSA_STATUS_SUCCESS) {
+ return new_signal;
+ }
+
+ // Fail
+ return {0};
+ }
+
+private:
+ static pthread_mutex_t mutex;
+ std::queue<hsa_signal_t> state;
+ struct lock {
+ lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
+ ~lock() { pthread_mutex_unlock(m); }
+ pthread_mutex_t *m;
+ };
+};
+
+extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
+
+namespace core {
+atmi_status_t atl_init_gpu_context();
+
+hsa_status_t init_hsa();
+hsa_status_t finalize_hsa();
+/*
+ * Generic utils
+ */
+template <typename T> inline T alignDown(T value, size_t alignment) {
+ return (T)(value & ~(alignment - 1));
+}
+
+template <typename T> inline T *alignDown(T *value, size_t alignment) {
+ return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
+}
+
+template <typename T> inline T alignUp(T value, size_t alignment) {
+ return alignDown((T)(value + alignment - 1), alignment);
+}
+
+template <typename T> inline T *alignUp(T *value, size_t alignment) {
+ return reinterpret_cast<T *>(
+ alignDown((intptr_t)(value + alignment - 1), alignment));
+}
+
+extern void register_allocation(void *addr, size_t size,
+ atmi_mem_place_t place);
+extern hsa_amd_memory_pool_t
+get_memory_pool_by_mem_place(atmi_mem_place_t place);
+extern bool atl_is_atmi_initialized();
+
+bool handle_group_signal(hsa_signal_value_t value, void *arg);
+
+void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest);
+uint16_t
+create_header(hsa_packet_type_t type, int barrier,
+ atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM,
+ atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM);
+
+void allow_access_to_all_gpu_agents(void *ptr);
+} // namespace core
+
+const char *get_error_string(hsa_status_t err);
+const char *get_atmi_error_string(atmi_status_t err);
+
+#define ATMIErrorCheck(msg, status) \
+ if (status != ATMI_STATUS_SUCCESS) { \
+ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
+ get_atmi_error_string(status)); \
+ exit(1); \
+ } else { \
+ /* printf("%s succeeded.\n", #msg);*/ \
+ }
+
+#define ErrorCheck(msg, status) \
+ if (status != HSA_STATUS_SUCCESS) { \
+ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
+ get_error_string(status)); \
+ exit(1); \
+ } else { \
+ /* printf("%s succeeded.\n", #msg);*/ \
+ }
+
+#define ErrorCheckAndContinue(msg, status) \
+ if (status != HSA_STATUS_SUCCESS) { \
+ DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg, \
+ get_error_string(status)); \
+ continue; \
+ } else { \
+ /* printf("%s succeeded.\n", #msg);*/ \
+ }
+
+#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
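Two small sketches of the utilities above (values illustrative; the pool
instance name is hypothetical):

    // alignUp rounds to the next multiple of a power-of-two alignment:
    size_t padded = core::alignUp<size_t>(13, 8); // == 16

    // SignalPoolT hands out reusable HSA signals:
    SignalPoolT FreeSignalPool;                    // hypothetical instance
    hsa_signal_t s = FreeSignalPool.pop();         // pooled, or freshly created
    // ... use s as a completion signal ...
    FreeSignalPool.push(s);                        // return it for reuse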
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
new file mode 100644
index 000000000000..64548dd4a0f8
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.cpp
@@ -0,0 +1,128 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include "machine.h"
+#include "atmi_runtime.h"
+#include "internal.h"
+#include <cassert>
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+extern ATLMachine g_atl_machine;
+extern hsa_region_t atl_cpu_kernarg_region;
+
+void *ATLMemory::alloc(size_t sz) {
+ void *ret;
+ hsa_status_t err = hsa_amd_memory_pool_allocate(memory_pool_, sz, 0, &ret);
+ ErrorCheck(Allocate from memory pool, err);
+ return ret;
+}
+
+void ATLMemory::free(void *ptr) {
+ hsa_status_t err = hsa_amd_memory_pool_free(ptr);
+ ErrorCheck(Free from memory pool, err);
+}
+
+void ATLProcessor::addMemory(const ATLMemory &mem) {
+ for (auto &mem_obj : memories_) {
+ // if the memory already exists, then just return
+ if (mem.memory().handle == mem_obj.memory().handle)
+ return;
+ }
+ memories_.push_back(mem);
+}
+
+const std::vector<ATLMemory> &ATLProcessor::memories() const {
+ return memories_;
+}
+
+template <> std::vector<ATLCPUProcessor> &ATLMachine::processors() {
+ return cpu_processors_;
+}
+
+template <> std::vector<ATLGPUProcessor> &ATLMachine::processors() {
+ return gpu_processors_;
+}
+
+hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc,
+ const int mem_id) {
+ hsa_amd_memory_pool_t pool;
+ const std::vector<ATLMemory> &mems = proc.memories();
+ assert(mems.size() && mem_id >= 0 && mem_id < mems.size() &&
+ "Invalid memory pools for this processor");
+ pool = mems[mem_id].memory();
+ return pool;
+}
+
+template <> void ATLMachine::addProcessor(const ATLCPUProcessor &p) {
+ cpu_processors_.push_back(p);
+}
+
+template <> void ATLMachine::addProcessor(const ATLGPUProcessor &p) {
+ gpu_processors_.push_back(p);
+}
+
+void callbackQueue(hsa_status_t status, hsa_queue_t *source, void *data) {
+ if (status != HSA_STATUS_SUCCESS) {
+ fprintf(stderr, "[%s:%d] GPU error in queue %p %d\n", __FILE__, __LINE__,
+ source, status);
+ abort();
+ }
+}
+
+void ATLGPUProcessor::createQueues(const int count) {
+ int *num_cus = reinterpret_cast<int *>(calloc(count, sizeof(int)));
+
+ hsa_status_t err;
+ /* Query the maximum size of the queue. */
+ uint32_t queue_size = 0;
+ err = hsa_agent_get_info(agent_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
+ ErrorCheck(Querying the agent maximum queue size, err);
+ if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
+ queue_size = core::Runtime::getInstance().getMaxQueueSize();
+ }
+
+ /* Create queues for each device. */
+ int qid;
+ for (qid = 0; qid < count; qid++) {
+ hsa_queue_t *this_Q;
+ err =
+ hsa_queue_create(agent_, queue_size, HSA_QUEUE_TYPE_MULTI,
+ callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &this_Q);
+ ErrorCheck(Creating the queue, err);
+ err = hsa_amd_profiling_set_profiler_enabled(this_Q, 1);
+ ErrorCheck(Enabling profiling support, err);
+
+ queues_.push_back(this_Q);
+
+ DEBUG_PRINT("Queue[%d]: %p\n", qid, this_Q);
+ }
+
+ free(num_cus);
+}
+
+void ATLCPUProcessor::createQueues(const int) {}
+
+void ATLProcessor::destroyQueues() {
+ for (auto queue : queues_) {
+ hsa_status_t err = hsa_queue_destroy(queue);
+ ErrorCheck(Destroying the queue, err);
+ }
+}
+
+int ATLProcessor::num_cus() const {
+ hsa_status_t err;
+ /* Query the number of compute units. */
+ uint32_t num_cus = 0;
+ err = hsa_agent_get_info(
+ agent_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
+ &num_cus);
+ ErrorCheck(Querying the agent number of compute units, err);
+
+ return num_cus;
+}
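A sketch of the queue lifecycle this implements (the count is illustrative):

    ATLGPUProcessor &gpu = g_atl_machine.processors<ATLGPUProcessor>()[0];
    gpu.createQueues(4); // four HSA_QUEUE_TYPE_MULTI queues, profiling enabled
    hsa_queue_t *q = gpu.queues()[0];
    (void)q; // dispatch packets would be written to this queue
    gpu.destroyQueues();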
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
new file mode 100644
index 000000000000..9ccf67f7e4c5
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
@@ -0,0 +1,109 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef SRC_RUNTIME_INCLUDE_MACHINE_H_
+#define SRC_RUNTIME_INCLUDE_MACHINE_H_
+#include "atmi.h"
+#include "internal.h"
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+#include <vector>
+
+class ATLMemory;
+
+class ATLProcessor {
+public:
+ explicit ATLProcessor(hsa_agent_t agent,
+ atmi_devtype_t type = ATMI_DEVTYPE_ALL)
+ : agent_(agent), type_(type) {
+ queues_.clear();
+ memories_.clear();
+ }
+ void addMemory(const ATLMemory &p);
+ hsa_agent_t agent() const { return agent_; }
+ // TODO(ashwinma): Do we need this or are we building the machine structure
+ // just once in the program?
+ // void removeMemory(ATLMemory &p);
+ const std::vector<ATLMemory> &memories() const;
+ atmi_devtype_t type() const { return type_; }
+
+ virtual void createQueues(const int count) {}
+ virtual void destroyQueues();
+ std::vector<hsa_queue_t *> queues() const { return queues_; }
+
+ int num_cus() const;
+
+protected:
+ hsa_agent_t agent_;
+ atmi_devtype_t type_;
+ std::vector<hsa_queue_t *> queues_;
+ std::vector<ATLMemory> memories_;
+};
+
+class ATLCPUProcessor : public ATLProcessor {
+public:
+ explicit ATLCPUProcessor(hsa_agent_t agent)
+ : ATLProcessor(agent, ATMI_DEVTYPE_CPU) {}
+ void createQueues(const int count);
+};
+
+class ATLGPUProcessor : public ATLProcessor {
+public:
+ explicit ATLGPUProcessor(hsa_agent_t agent,
+ atmi_devtype_t type = ATMI_DEVTYPE_dGPU)
+ : ATLProcessor(agent, type) {}
+ void createQueues(const int count);
+};
+
+class ATLMemory {
+public:
+ ATLMemory(hsa_amd_memory_pool_t pool, ATLProcessor p, atmi_memtype_t t)
+ : memory_pool_(pool), processor_(p), type_(t) {}
+ ATLProcessor &processor() { return processor_; }
+ hsa_amd_memory_pool_t memory() const { return memory_pool_; }
+
+ atmi_memtype_t type() const { return type_; }
+
+ void *alloc(size_t s);
+ void free(void *p);
+
+private:
+ hsa_amd_memory_pool_t memory_pool_;
+ ATLProcessor processor_;
+ atmi_memtype_t type_;
+};
+
+class ATLMachine {
+public:
+ ATLMachine() {
+ cpu_processors_.clear();
+ gpu_processors_.clear();
+ }
+ template <typename T> void addProcessor(const T &p);
+ template <typename T> std::vector<T> &processors();
+ template <typename T> size_t processorCount() {
+ return processors<T>().size();
+ }
+
+private:
+ std::vector<ATLCPUProcessor> cpu_processors_;
+ std::vector<ATLGPUProcessor> gpu_processors_;
+};
+
+hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc,
+ const int mem_id);
+
+extern ATLMachine g_atl_machine;
+template <typename T> T &get_processor(atmi_place_t place) {
+ int dev_id = place.device_id;
+ if (dev_id == -1) {
+ // user is asking runtime to pick a device
+ // TODO(ashwinma): best device of this type? pick 0 for now
+ dev_id = 0;
+ }
+ return g_atl_machine.processors<T>()[dev_id];
+}
+
+#endif // SRC_RUNTIME_INCLUDE_MACHINE_H_
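A sketch of resolving an execution place through the template above; a
device_id of -1 currently falls back to device 0:

    atmi_place_t place = ATMI_PLACE_GPU(0, -1); // "any" GPU
    ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
    int cus = proc.num_cus(); // compute units via hsa_agent_get_info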
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
new file mode 100644
index 000000000000..6da12f937034
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
@@ -0,0 +1,264 @@
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <string>
+
+#include "msgpack.h"
+
+namespace msgpack {
+
+[[noreturn]] void internal_error() {
+ printf("internal error\n");
+ exit(1);
+}
+
+const char *type_name(type ty) {
+ switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+ case NAME: \
+ return #NAME;
+#include "msgpack.def"
+#undef X
+ }
+ internal_error();
+}
+
+unsigned bytes_used_fixed(msgpack::type ty) {
+ using namespace msgpack;
+ switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+ case NAME: \
+ return WIDTH;
+#include "msgpack.def"
+#undef X
+ }
+ internal_error();
+}
+
+msgpack::type parse_type(unsigned char x) {
+
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+ if (x >= LOWER && x <= UPPER) { \
+ return NAME; \
+ } else
+#include "msgpack.def"
+#undef X
+ { internal_error(); }
+}
+
+template <typename T, typename R> R bitcast(T x) {
+ static_assert(sizeof(T) == sizeof(R), "");
+ R tmp;
+ memcpy(&tmp, &x, sizeof(T));
+ return tmp;
+}
+template int64_t bitcast<uint64_t, int64_t>(uint64_t);
+} // namespace msgpack
+
+// Helper functions for reading additional payload from the header
+// Depending on the type, this can be a number of bytes, elements,
+// key-value pairs or an embedded integer.
+// Each takes a pointer to the start of the header and returns a uint64_t
+
+namespace {
+namespace payload {
+uint64_t read_zero(const unsigned char *) { return 0; }
+
+// Read the first byte and zero/sign extend it
+uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
+uint64_t read_embedded_s8(const unsigned char *start) {
+ int64_t res = msgpack::bitcast<uint8_t, int8_t>(start[0]);
+ return msgpack::bitcast<int64_t, uint64_t>(res);
+}
+
+// Read a masked part of the first byte
+uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
+uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
+uint64_t read_via_mask_0x1f(const unsigned char *start) {
+ return *start & 0x1fu;
+}
+
+// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
+// Big endian format.
+uint64_t read_size_field_u8(const unsigned char *from) {
+ from++;
+ return from[0];
+}
+
+// TODO: detect whether the host is little endian and whether the bswap
+// intrinsic is available; the builtin could then be used to test the DIY
+// byte-swap paths below.
+const bool use_bswap = false;
+
+uint64_t read_size_field_u16(const unsigned char *from) {
+ from++;
+ if (use_bswap) {
+ uint16_t b;
+ memcpy(&b, from, 2);
+ return __builtin_bswap16(b);
+ } else {
+ return (from[0] << 8u) | from[1];
+ }
+}
+uint64_t read_size_field_u32(const unsigned char *from) {
+ from++;
+ if (use_bswap) {
+ uint32_t b;
+ memcpy(&b, from, 4);
+ return __builtin_bswap32(b);
+ } else {
+ return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) |
+ (from[3] << 0u);
+ }
+}
+uint64_t read_size_field_u64(const unsigned char *from) {
+ from++;
+ if (use_bswap) {
+ uint64_t b;
+ memcpy(&b, from, 8);
+ return __builtin_bswap64(b);
+ } else {
+ return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
+ ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
+ (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) |
+ (from[7] << 0u);
+ }
+}
+
+uint64_t read_size_field_s8(const unsigned char *from) {
+ uint8_t u = read_size_field_u8(from);
+ int64_t res = msgpack::bitcast<uint8_t, int8_t>(u);
+ return msgpack::bitcast<int64_t, uint64_t>(res);
+}
+uint64_t read_size_field_s16(const unsigned char *from) {
+ uint16_t u = read_size_field_u16(from);
+ int64_t res = msgpack::bitcast<uint16_t, int16_t>(u);
+ return msgpack::bitcast<int64_t, uint64_t>(res);
+}
+uint64_t read_size_field_s32(const unsigned char *from) {
+ uint32_t u = read_size_field_u32(from);
+ int64_t res = msgpack::bitcast<uint32_t, int32_t>(u);
+ return msgpack::bitcast<int64_t, uint64_t>(res);
+}
+uint64_t read_size_field_s64(const unsigned char *from) {
+ uint64_t u = read_size_field_u64(from);
+ int64_t res = msgpack::bitcast<uint64_t, int64_t>(u);
+ return msgpack::bitcast<int64_t, uint64_t>(res);
+}
+} // namespace payload
+} // namespace
+
+namespace msgpack {
+
+payload_info_t payload_info(msgpack::type ty) {
+ using namespace msgpack;
+ switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+ case NAME: \
+ return payload::PAYLOAD;
+#include "msgpack.def"
+#undef X
+ }
+ internal_error();
+}
+
+} // namespace msgpack
+
+const unsigned char *msgpack::skip_next_message(const unsigned char *start,
+ const unsigned char *end) {
+ class f : public functors_defaults<f> {};
+ return handle_msgpack({start, end}, f());
+}
+
+namespace msgpack {
+bool message_is_string(byte_range bytes, const char *needle) {
+ bool matched = false;
+ size_t needleN = strlen(needle);
+
+ foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
+ if (N == needleN) {
+ if (memcmp(needle, str, N) == 0) {
+ matched = true;
+ }
+ }
+ });
+ return matched;
+}
+
+void dump(byte_range bytes) {
+ struct inner : functors_defaults<inner> {
+ inner(unsigned indent) : indent(indent) {}
+ const unsigned by = 2;
+ unsigned indent = 0;
+
+ void handle_string(size_t N, const unsigned char *bytes) {
+ char *tmp = (char *)malloc(N + 1);
+ memcpy(tmp, bytes, N);
+ tmp[N] = '\0';
+ printf("\"%s\"", tmp);
+ free(tmp);
+ }
+
+ void handle_signed(int64_t x) { printf("%ld", x); }
+ void handle_unsigned(uint64_t x) { printf("%lu", x); }
+
+ const unsigned char *handle_array(uint64_t N, byte_range bytes) {
+ printf("\n%*s[\n", indent, "");
+ indent += by;
+
+ for (uint64_t i = 0; i < N; i++) {
+ indent += by;
+ printf("%*s", indent, "");
+ const unsigned char *next = handle_msgpack<inner>(bytes, {indent});
+ printf(",\n");
+ indent -= by;
+ bytes.start = next;
+ if (!next) {
+ break;
+ }
+ }
+ indent -= by;
+ printf("%*s]", indent, "");
+
+ return bytes.start;
+ }
+
+ const unsigned char *handle_map(uint64_t N, byte_range bytes) {
+ printf("\n%*s{\n", indent, "");
+ indent += by;
+
+ for (uint64_t i = 0; i < 2 * N; i += 2) {
+ const unsigned char *start_key = bytes.start;
+ printf("%*s", indent, "");
+ const unsigned char *end_key =
+ handle_msgpack<inner>({start_key, bytes.end}, {indent});
+ if (!end_key) {
+ break;
+ }
+
+ printf(" : ");
+
+ const unsigned char *start_value = end_key;
+ const unsigned char *end_value =
+ handle_msgpack<inner>({start_value, bytes.end}, {indent});
+
+ if (!end_value) {
+ break;
+ }
+
+ printf(",\n");
+ bytes.start = end_value;
+ }
+
+ indent -= by;
+ printf("%*s}", indent, "");
+
+ return bytes.start;
+ }
+ };
+
+ handle_msgpack<inner>(bytes, {0});
+ printf("\n");
+}
+
+} // namespace msgpack
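
A minimal usage sketch (illustrative only; it assumes the msgpack.h header
below is on the include path and that this msgpack.cpp is linked in):
hand-encode the map {"a": 5} and walk it with the helper functors.

#include "msgpack.h"
#include <cstdint>
#include <cstdio>

int main() {
  // fixmap with one pair, fixstr "a" of length 1, posfixint 5
  const unsigned char buf[] = {0x81, 0xa1, 'a', 0x05};
  msgpack::byte_range all = {buf, buf + sizeof(buf)};
  msgpack::foreach_map(all, [](msgpack::byte_range key,
                               msgpack::byte_range value) {
    if (msgpack::message_is_string(key, "a")) {
      msgpack::foronly_unsigned(value, [](uint64_t x) {
        printf("a = %llu\n", (unsigned long long)x);
      });
    }
  });
  return 0;
}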
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
new file mode 100644
index 000000000000..a686c5a2f6ec
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
@@ -0,0 +1,38 @@
+// name, header width, reader, [lower, upper] encoding
+X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
+X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
+X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
+X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
+X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
+X(nil, 1, read_zero, 0xc0, 0xc0)
+X(never_used, 1, read_zero, 0xc1, 0xc1)
+X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
+X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
+X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
+X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
+X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
+X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
+X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
+X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
+X(float32, 5, read_zero, 0xca, 0xca)
+X(float64, 9, read_zero, 0xcb, 0xcb)
+X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
+X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
+X(uint32, 5, read_size_field_u32, 0xce, 0xce)
+X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
+X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
+X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
+X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
+X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
+X(fixext1, 3, read_zero, 0xd4, 0xd4)
+X(fixext2, 4, read_zero, 0xd5, 0xd5)
+X(fixext4, 6, read_zero, 0xd6, 0xd6)
+X(fixext8, 10, read_zero, 0xd7, 0xd7)
+X(fixext16, 18, read_zero, 0xd8, 0xd8)
+X(str8, 2, read_size_field_u8, 0xd9, 0xd9)
+X(str16, 3, read_size_field_u16, 0xda, 0xda)
+X(str32, 5, read_size_field_u32, 0xdb, 0xdb)
+X(array16, 3, read_size_field_u16, 0xdc, 0xdc)
+X(array32, 5, read_size_field_u32, 0xdd, 0xdd)
+X(map16, 3, read_size_field_u16, 0xde, 0xde)
+X(map32, 5, read_size_field_u32, 0xdf, 0xdf)
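
Each X(...) row above is expanded several times under different definitions of
X. As an illustration (not literal source), the first two rows become enum
members in msgpack.h and range checks in parse_type in msgpack.cpp roughly as
follows:

// Enum expansion:
typedef enum : uint8_t {
  posfixint, // from X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
  negfixint, // from X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
  // ... one enumerator per row ...
} type;

// parse_type expansion, matching a byte against each [lower, upper] range:
type parse_type(unsigned char x) {
  if (x >= 0x00 && x <= 0x7f) {
    return posfixint;
  } else if (x >= 0xe0 && x <= 0xff) {
    return negfixint;
  } else
    // ... one range check per row ...
    { internal_error(); }
}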
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
new file mode 100644
index 000000000000..45f11d3ba45d
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
@@ -0,0 +1,275 @@
+#ifndef MSGPACK_H
+#define MSGPACK_H
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+
+namespace msgpack {
+
+// The MessagePack format is dynamically typed and schema-less. A message is:
+// message: [type][header][payload]
+// where type is one byte, the header length is a fixed function of the type,
+// and the payload is zero to N bytes with the length encoded in [type][header]
+
+// Scalar fields include boolean, signed integer, float, string etc
+// Composite types are sequences of messages
+// Array field is [header][element][element]...
+// Map field is [header][key][value][key][value]...
+
+// Multibyte integer fields are big endian encoded
+// The map key can be any message type
+// Maps may contain duplicate keys
+// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
+// in as many as nine, as signed or unsigned. Implementation defined.
+// Similarly "foo" may embed the length in the type field or in multiple bytes
+
+// This parser is structured as an iterator over a sequence of bytes.
+// It calls a user provided function on each message in order to extract fields
+// The default implementation for each scalar type is to do nothing. For map or
+// arrays, the default implementation returns just after that message to support
+// iterating to the next message, but otherwise has no effect.
+
+struct byte_range {
+ const unsigned char *start;
+ const unsigned char *end;
+};
+
+const unsigned char *skip_next_message(const unsigned char *start,
+ const unsigned char *end);
+
+template <typename Derived> class functors_defaults {
+public:
+ void cb_string(size_t N, const unsigned char *str) {
+ derived().handle_string(N, str);
+ }
+ void cb_boolean(bool x) { derived().handle_boolean(x); }
+ void cb_signed(int64_t x) { derived().handle_signed(x); }
+ void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
+ void cb_array_elements(byte_range bytes) {
+ derived().handle_array_elements(bytes);
+ }
+ void cb_map_elements(byte_range key, byte_range value) {
+ derived().handle_map_elements(key, value);
+ }
+ const unsigned char *cb_array(uint64_t N, byte_range bytes) {
+ return derived().handle_array(N, bytes);
+ }
+ const unsigned char *cb_map(uint64_t N, byte_range bytes) {
+ return derived().handle_map(N, bytes);
+ }
+
+private:
+ Derived &derived() { return *static_cast<Derived *>(this); }
+
+ // Default implementations for scalar ops are no-ops
+ void handle_string(size_t, const unsigned char *) {}
+ void handle_boolean(bool) {}
+ void handle_signed(int64_t) {}
+ void handle_unsigned(uint64_t) {}
+ void handle_array_elements(byte_range) {}
+ void handle_map_elements(byte_range, byte_range) {}
+
+ // Default implementation for sequences is to skip over the messages
+ const unsigned char *handle_array(uint64_t N, byte_range bytes) {
+ for (uint64_t i = 0; i < N; i++) {
+ const unsigned char *next = skip_next_message(bytes.start, bytes.end);
+ if (!next) {
+ return nullptr;
+ }
+ cb_array_elements(bytes);
+ bytes.start = next;
+ }
+ return bytes.start;
+ }
+ const unsigned char *handle_map(uint64_t N, byte_range bytes) {
+ for (uint64_t i = 0; i < N; i++) {
+ const unsigned char *start_key = bytes.start;
+ const unsigned char *end_key = skip_next_message(start_key, bytes.end);
+ if (!end_key) {
+ return nullptr;
+ }
+ const unsigned char *start_value = end_key;
+ const unsigned char *end_value =
+ skip_next_message(start_value, bytes.end);
+ if (!end_value) {
+ return nullptr;
+ }
+ cb_map_elements({start_key, end_key}, {start_value, end_value});
+ bytes.start = end_value;
+ }
+ return bytes.start;
+ }
+};
+
+typedef enum : uint8_t {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
+#include "msgpack.def"
+#undef X
+} type;
+
+[[noreturn]] void internal_error();
+type parse_type(unsigned char x);
+unsigned bytes_used_fixed(type ty);
+
+typedef uint64_t (*payload_info_t)(const unsigned char *);
+payload_info_t payload_info(msgpack::type ty);
+
+template <typename T, typename R> R bitcast(T x);
+
+template <typename F, msgpack::type ty>
+const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
+ const unsigned char *start = bytes.start;
+ const unsigned char *end = bytes.end;
+ const uint64_t available = end - start;
+ assert(available != 0);
+ assert(ty == parse_type(*start));
+
+ const uint64_t bytes_used = bytes_used_fixed(ty);
+ if (available < bytes_used) {
+ return 0;
+ }
+ const uint64_t available_post_header = available - bytes_used;
+
+ const payload_info_t info = payload_info(ty);
+ const uint64_t N = info(start);
+
+ switch (ty) {
+ case msgpack::t:
+ case msgpack::f: {
+ // f is 0b11000010, t is 0b11000011; the low bit distinguishes them
+ f.cb_boolean(N);
+ return start + bytes_used;
+ }
+
+ case msgpack::posfixint:
+ case msgpack::uint8:
+ case msgpack::uint16:
+ case msgpack::uint32:
+ case msgpack::uint64: {
+ f.cb_unsigned(N);
+ return start + bytes_used;
+ }
+
+ case msgpack::negfixint:
+ case msgpack::int8:
+ case msgpack::int16:
+ case msgpack::int32:
+ case msgpack::int64: {
+ f.cb_signed(bitcast<uint64_t, int64_t>(N));
+ return start + bytes_used;
+ }
+
+ case msgpack::fixstr:
+ case msgpack::str8:
+ case msgpack::str16:
+ case msgpack::str32: {
+ if (available_post_header < N) {
+ return 0;
+ } else {
+ f.cb_string(N, start + bytes_used);
+ return start + bytes_used + N;
+ }
+ }
+
+ case msgpack::fixarray:
+ case msgpack::array16:
+ case msgpack::array32: {
+ return f.cb_array(N, {start + bytes_used, end});
+ }
+
+ case msgpack::fixmap:
+ case msgpack::map16:
+ case msgpack::map32: {
+ return f.cb_map(N, {start + bytes_used, end});
+ }
+
+ case msgpack::nil:
+ case msgpack::bin8:
+ case msgpack::bin16:
+ case msgpack::bin32:
+ case msgpack::float32:
+ case msgpack::float64:
+ case msgpack::ext8:
+ case msgpack::ext16:
+ case msgpack::ext32:
+ case msgpack::fixext1:
+ case msgpack::fixext2:
+ case msgpack::fixext4:
+ case msgpack::fixext8:
+ case msgpack::fixext16:
+ case msgpack::never_used: {
+ if (available_post_header < N) {
+ return 0;
+ }
+ return start + bytes_used + N;
+ }
+ }
+ internal_error();
+}
+
+template <typename F>
+const unsigned char *handle_msgpack(byte_range bytes, F f) {
+ const unsigned char *start = bytes.start;
+ const unsigned char *end = bytes.end;
+ const uint64_t available = end - start;
+ if (available == 0) {
+ return 0;
+ }
+ const type ty = parse_type(*start);
+
+ switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+ case msgpack::NAME: \
+ return handle_msgpack_given_type<F, msgpack::NAME>(bytes, f);
+#include "msgpack.def"
+#undef X
+ }
+
+ internal_error();
+}
+
+bool message_is_string(byte_range bytes, const char *str);
+
+template <typename C> void foronly_string(byte_range bytes, C callback) {
+ struct inner : functors_defaults<inner> {
+ inner(C &cb) : cb(cb) {}
+ C &cb;
+ void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
+ };
+ handle_msgpack<inner>(bytes, {callback});
+}
+
+template <typename C> void foronly_unsigned(byte_range bytes, C callback) {
+ struct inner : functors_defaults<inner> {
+ inner(C &cb) : cb(cb) {}
+ C &cb;
+ void handle_unsigned(uint64_t x) { cb(x); }
+ };
+ handle_msgpack<inner>(bytes, {callback});
+}
+
+template <typename C> void foreach_array(byte_range bytes, C callback) {
+ struct inner : functors_defaults<inner> {
+ inner(C &cb) : cb(cb) {}
+ C &cb;
+ void handle_array_elements(byte_range element) { cb(element); }
+ };
+ handle_msgpack<inner>(bytes, {callback});
+}
+
+template <typename C> void foreach_map(byte_range bytes, C callback) {
+ struct inner : functors_defaults<inner> {
+ inner(C &cb) : cb(cb) {}
+ C &cb;
+ void handle_map_elements(byte_range key, byte_range value) {
+ cb(key, value);
+ }
+ };
+ handle_msgpack<inner>(bytes, {callback});
+}
+
+// Crude approximation to json
+void dump(byte_range);
+
+} // namespace msgpack
+
+#endif
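
Extending the parser follows the same CRTP shape as the foronly_/foreach_
helpers above. As a sketch (hypothetical names, for illustration), a functor
that extracts a boolean from a message:

#include "msgpack.h"

// Overrides only handle_boolean; functors_defaults skips every other field
// kind. handle_msgpack takes the functor by value, so the result is written
// through a reference member, as the helpers above do.
struct bool_reader : msgpack::functors_defaults<bool_reader> {
  bool_reader(bool &slot) : slot(slot) {}
  bool &slot;
  void handle_boolean(bool x) { slot = x; }
};

// Returns fallback when the message is not a boolean. Note that the default
// map/array handlers do not recurse, so this fires only for a top-level
// boolean message.
bool read_bool(msgpack::byte_range bytes, bool fallback) {
  bool result = fallback;
  msgpack::handle_msgpack(bytes, bool_reader{result});
  return result;
}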
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
new file mode 100644
index 000000000000..8863c383e500
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
@@ -0,0 +1,108 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#ifndef SRC_RUNTIME_INCLUDE_RT_H_
+#define SRC_RUNTIME_INCLUDE_RT_H_
+
+#include "atmi_runtime.h"
+#include "hsa.h"
+#include <cstdarg>
+#include <cstdlib>
+#include <string>
+
+namespace core {
+
+#define DEFAULT_MAX_QUEUE_SIZE 4096
+#define DEFAULT_MAX_KERNEL_TYPES 32
+#define DEFAULT_NUM_GPU_QUEUES -1 // computed in code
+#define DEFAULT_NUM_CPU_QUEUES -1 // computed in code
+#define DEFAULT_DEBUG_MODE 0
+class Environment {
+public:
+ Environment()
+ : max_queue_size_(DEFAULT_MAX_QUEUE_SIZE),
+ max_kernel_types_(DEFAULT_MAX_KERNEL_TYPES),
+ num_gpu_queues_(DEFAULT_NUM_GPU_QUEUES),
+ num_cpu_queues_(DEFAULT_NUM_CPU_QUEUES),
+ debug_mode_(DEFAULT_DEBUG_MODE) {
+ GetEnvAll();
+ }
+
+ ~Environment() {}
+
+ void GetEnvAll();
+
+ int getMaxQueueSize() const { return max_queue_size_; }
+ int getMaxKernelTypes() const { return max_kernel_types_; }
+ int getNumGPUQueues() const { return num_gpu_queues_; }
+ int getNumCPUQueues() const { return num_cpu_queues_; }
+ // TODO(ashwinma): int may change to enum if we have more debug modes
+ int getDebugMode() const { return debug_mode_; }
+ // TODO(ashwinma): int may change to enum if we have more profile modes
+
+private:
+ std::string GetEnv(const char *name) {
+ char *env = getenv(name);
+ std::string ret;
+ if (env) {
+ ret = env;
+ }
+ return ret;
+ }
+
+ int max_queue_size_;
+ int max_kernel_types_;
+ int num_gpu_queues_;
+ int num_cpu_queues_;
+ int debug_mode_;
+};
+
+class Runtime final {
+public:
+ static Runtime &getInstance() {
+ static Runtime instance;
+ return instance;
+ }
+
+ // init/finalize
+ static atmi_status_t Initialize();
+ static atmi_status_t Finalize();
+
+ // modules
+ static atmi_status_t RegisterModuleFromMemory(
+ void *, size_t, atmi_place_t,
+ atmi_status_t (*on_deserialized_data)(void *data, size_t size,
+ void *cb_state),
+ void *cb_state);
+
+ // machine info
+ static atmi_machine_t *GetMachineInfo();
+
+ // data
+ static atmi_status_t Memcpy(void *, const void *, size_t);
+ static atmi_status_t Memfree(void *);
+ static atmi_status_t Malloc(void **, size_t, atmi_mem_place_t);
+
+ // environment variables
+ int getMaxQueueSize() const { return env_.getMaxQueueSize(); }
+ int getMaxKernelTypes() const { return env_.getMaxKernelTypes(); }
+ int getNumGPUQueues() const { return env_.getNumGPUQueues(); }
+ int getNumCPUQueues() const { return env_.getNumCPUQueues(); }
+ // TODO(ashwinma): int may change to enum if we have more debug modes
+ int getDebugMode() const { return env_.getDebugMode(); }
+
+protected:
+ Runtime() = default;
+ ~Runtime() = default;
+ Runtime(const Runtime &) = delete;
+ Runtime &operator=(const Runtime &) = delete;
+
+protected:
+ // variable to track environment variables
+ Environment env_;
+};
+
+} // namespace core
+
+#endif // SRC_RUNTIME_INCLUDE_RT_H_
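
A sketch of how this interface is consumed (illustrative; error handling is
abbreviated, the ATMI_MEM_PLACE macro comes from the atmi headers included
above, and device 0 is assumed to exist):

#include "rt.h"

// Allocate device memory, copy a host buffer into it, then free it.
atmi_status_t copy_to_device(const void *host, size_t bytes) {
  atmi_status_t err = core::Runtime::Initialize();
  if (err != ATMI_STATUS_SUCCESS)
    return err;

  atmi_mem_place_t place =
      ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, /*device*/ 0, /*memory*/ 0);
  void *dev = nullptr;
  err = core::Runtime::Malloc(&dev, bytes, place);
  if (err != ATMI_STATUS_SUCCESS)
    return err;

  err = core::Runtime::Memcpy(dev, host, bytes);
  if (err != ATMI_STATUS_SUCCESS)
    return err;

  return core::Runtime::Memfree(dev);
}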
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
new file mode 100644
index 000000000000..2c31aafc7624
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -0,0 +1,1121 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include <gelf.h>
+#include <libelf.h>
+
+#include <cassert>
+#include <cstdarg>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "internal.h"
+#include "machine.h"
+#include "rt.h"
+
+#include "msgpack.h"
+
+#define msgpackErrorCheck(msg, status) \
+ if (status != 0) { \
+ printf("[%s:%d] %s failed\n", __FILE__, __LINE__, #msg); \
+ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; \
+ } else { \
+ }
+
+typedef unsigned char *address;
+/*
+ * Note descriptors.
+ */
+typedef struct {
+ uint32_t n_namesz; /* Length of note's name. */
+ uint32_t n_descsz; /* Length of note's value. */
+ uint32_t n_type; /* Type of note. */
+ // then name
+ // then padding, optional
+ // then desc, at 4 byte alignment (not 8, despite being elf64)
+} Elf_Note;
+
+// The include file and structs/enums below have been replicated on a
+// per-use basis. For example, llvm::AMDGPU::HSAMD::Kernel::Metadata has
+// several fields, but we may only care about kernargSegmentSize_ for now,
+// so our KernelMD implementation carries just that field. We chose to
+// replicate rather than include so that the runtime does not need a
+// dependency on LLVM_INCLUDE_DIR just to compile.
+// #include "llvm/Support/AMDGPUMetadata.h"
+// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
+// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
+// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
+// using llvm::AMDGPU::HSAMD::AccessQualifier;
+// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
+// using llvm::AMDGPU::HSAMD::ValueKind;
+// using llvm::AMDGPU::HSAMD::ValueType;
+
+class KernelArgMD {
+public:
+ enum class ValueKind {
+ HiddenGlobalOffsetX,
+ HiddenGlobalOffsetY,
+ HiddenGlobalOffsetZ,
+ HiddenNone,
+ HiddenPrintfBuffer,
+ HiddenDefaultQueue,
+ HiddenCompletionAction,
+ HiddenMultiGridSyncArg,
+ HiddenHostcallBuffer,
+ Unknown
+ };
+
+ KernelArgMD()
+ : name_(std::string()), typeName_(std::string()), size_(0), offset_(0),
+ align_(0), valueKind_(ValueKind::Unknown) {}
+
+ // fields
+ std::string name_;
+ std::string typeName_;
+ uint32_t size_;
+ uint32_t offset_;
+ uint32_t align_;
+ ValueKind valueKind_;
+};
+
+class KernelMD {
+public:
+ KernelMD() : kernargSegmentSize_(0ull) {}
+
+ // fields
+ uint64_t kernargSegmentSize_;
+};
+
+static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
+ // Including only those fields that are relevant to the runtime.
+ // {"ByValue", KernelArgMD::ValueKind::ByValue},
+ // {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
+ // {"DynamicSharedPointer",
+ // KernelArgMD::ValueKind::DynamicSharedPointer},
+ // {"Sampler", KernelArgMD::ValueKind::Sampler},
+ // {"Image", KernelArgMD::ValueKind::Image},
+ // {"Pipe", KernelArgMD::ValueKind::Pipe},
+ // {"Queue", KernelArgMD::ValueKind::Queue},
+ {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
+ {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
+ {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
+ {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
+ {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
+ {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
+ {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
+ {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
+ {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
+ // v3
+ // {"by_value", KernelArgMD::ValueKind::ByValue},
+ // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
+ // {"dynamic_shared_pointer",
+ // KernelArgMD::ValueKind::DynamicSharedPointer},
+ // {"sampler", KernelArgMD::ValueKind::Sampler},
+ // {"image", KernelArgMD::ValueKind::Image},
+ // {"pipe", KernelArgMD::ValueKind::Pipe},
+ // {"queue", KernelArgMD::ValueKind::Queue},
+ {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
+ {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
+ {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
+ {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
+ {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
+ {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
+ {"hidden_completion_action",
+ KernelArgMD::ValueKind::HiddenCompletionAction},
+ {"hidden_multigrid_sync_arg",
+ KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
+ {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
+};
+
+// public variables -- TODO(ashwinma) move these to a runtime object?
+atmi_machine_t g_atmi_machine;
+ATLMachine g_atl_machine;
+
+hsa_region_t atl_gpu_kernarg_region;
+std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
+hsa_region_t atl_cpu_kernarg_region;
+
+static std::vector<hsa_executable_t> g_executables;
+
+std::map<std::string, std::string> KernelNameMap;
+std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
+std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
+
+bool g_atmi_initialized = false;
+bool g_atmi_hostcall_required = false;
+
+struct timespec context_init_time;
+int context_init_time_init = 0;
+
+/*
+ atlc holds all of the internal global values.
+ The structure atl_context_t is defined in atl_internal.h.
+ Most references use the global structure atlc directly;
+ the pointer atlc_p-> is equivalent to atlc.
+*/
+
+atl_context_t atlc = {.struct_initialized = false};
+atl_context_t *atlc_p = NULL;
+
+hsa_signal_t IdentityCopySignal;
+
+namespace core {
+/* Machine Info */
+atmi_machine_t *Runtime::GetMachineInfo() {
+ if (!atlc.g_hsa_initialized)
+ return NULL;
+ return &g_atmi_machine;
+}
+
+void atl_set_atmi_initialized() {
+ // FIXME: thread safe? locks?
+ g_atmi_initialized = true;
+}
+
+void atl_reset_atmi_initialized() {
+ // FIXME: thread safe? locks?
+ g_atmi_initialized = false;
+}
+
+bool atl_is_atmi_initialized() { return g_atmi_initialized; }
+
+void allow_access_to_all_gpu_agents(void *ptr) {
+ hsa_status_t err;
+ std::vector<ATLGPUProcessor> &gpu_procs =
+ g_atl_machine.processors<ATLGPUProcessor>();
+ std::vector<hsa_agent_t> agents;
+ for (uint32_t i = 0; i < gpu_procs.size(); i++) {
+ agents.push_back(gpu_procs[i].agent());
+ }
+ err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
+ ErrorCheck(Allow agents ptr access, err);
+}
+
+atmi_status_t Runtime::Initialize() {
+ atmi_devtype_t devtype = ATMI_DEVTYPE_GPU;
+ if (atl_is_atmi_initialized())
+ return ATMI_STATUS_SUCCESS;
+
+ if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) {
+ ATMIErrorCheck(GPU context init, atl_init_gpu_context());
+ }
+
+ atl_set_atmi_initialized();
+ return ATMI_STATUS_SUCCESS;
+}
+
+atmi_status_t Runtime::Finalize() {
+ // TODO(ashwinma): Finalize all processors, queues, signals, kernarg memory
+ // regions
+ hsa_status_t err;
+
+ for (uint32_t i = 0; i < g_executables.size(); i++) {
+ err = hsa_executable_destroy(g_executables[i]);
+ ErrorCheck(Destroying executable, err);
+ }
+
+ // Finalize queues
+ for (auto &p : g_atl_machine.processors<ATLCPUProcessor>()) {
+ p.destroyQueues();
+ }
+ for (auto &p : g_atl_machine.processors<ATLGPUProcessor>()) {
+ p.destroyQueues();
+ }
+
+ for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
+ SymbolInfoTable[i].clear();
+ }
+ SymbolInfoTable.clear();
+ for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
+ KernelInfoTable[i].clear();
+ }
+ KernelInfoTable.clear();
+
+ atl_reset_atmi_initialized();
+ err = hsa_shut_down();
+ ErrorCheck(Shutting down HSA, err);
+
+ return ATMI_STATUS_SUCCESS;
+}
+
+void atmi_init_context_structs() {
+ atlc_p = &atlc;
+ atlc.struct_initialized = true; /* This only gets called one time */
+ atlc.g_hsa_initialized = false;
+ atlc.g_gpu_initialized = false;
+ atlc.g_tasks_initialized = false;
+}
+
+// Implement memory_pool iteration function
+static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
+ void *data) {
+ ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
+ hsa_status_t err = HSA_STATUS_SUCCESS;
+ // Check if the memory_pool is allowed to allocate, i.e. do not return group
+ // memory
+ bool alloc_allowed = false;
+ err = hsa_amd_memory_pool_get_info(
+ memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
+ &alloc_allowed);
+ ErrorCheck(Alloc allowed in memory pool check, err);
+ if (alloc_allowed) {
+ uint32_t global_flag = 0;
+ err = hsa_amd_memory_pool_get_info(
+ memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
+ ErrorCheck(Get memory pool info, err);
+ if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
+ ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
+ proc->addMemory(new_mem);
+ if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
+ DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
+ atl_gpu_kernarg_pools.push_back(memory_pool);
+ }
+ } else {
+ ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
+ proc->addMemory(new_mem);
+ }
+ }
+
+ return err;
+}
+
+static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
+ hsa_status_t err = HSA_STATUS_SUCCESS;
+ hsa_device_type_t device_type;
+ err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+ ErrorCheck(Get device type info, err);
+ switch (device_type) {
+ case HSA_DEVICE_TYPE_CPU: {
+ ATLCPUProcessor new_proc(agent);
+ err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
+ &new_proc);
+ ErrorCheck(Iterate all memory pools, err);
+ g_atl_machine.addProcessor(new_proc);
+ } break;
+ case HSA_DEVICE_TYPE_GPU: {
+ hsa_profile_t profile;
+ err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
+ ErrorCheck(Query the agent profile, err);
+ atmi_devtype_t gpu_type;
+ gpu_type =
+ (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
+ ATLGPUProcessor new_proc(agent, gpu_type);
+ err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
+ &new_proc);
+ ErrorCheck(Iterate all memory pools, err);
+ g_atl_machine.addProcessor(new_proc);
+ } break;
+ case HSA_DEVICE_TYPE_DSP: {
+ err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+ } break;
+ }
+
+ return err;
+}
+
+hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
+ hsa_region_segment_t segment;
+ hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
+ if (segment != HSA_REGION_SEGMENT_GLOBAL) {
+ return HSA_STATUS_SUCCESS;
+ }
+ hsa_region_global_flag_t flags;
+ hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+ if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
+ hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
+ *ret = region;
+ return HSA_STATUS_INFO_BREAK;
+ }
+ return HSA_STATUS_SUCCESS;
+}
+
+/* Determines if a memory region can be used for kernarg allocations. */
+static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
+ hsa_region_segment_t segment;
+ hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
+ if (HSA_REGION_SEGMENT_GLOBAL != segment) {
+ return HSA_STATUS_SUCCESS;
+ }
+
+ hsa_region_global_flag_t flags;
+ hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+ if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
+ hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
+ *ret = region;
+ return HSA_STATUS_INFO_BREAK;
+ }
+
+ return HSA_STATUS_SUCCESS;
+}
+
+static hsa_status_t init_compute_and_memory() {
+ hsa_status_t err;
+
+ /* Iterate over the agents and pick the gpu agent */
+ err = hsa_iterate_agents(get_agent_info, NULL);
+ if (err == HSA_STATUS_INFO_BREAK) {
+ err = HSA_STATUS_SUCCESS;
+ }
+ ErrorCheck(Getting a gpu agent, err);
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+
+ /* Init all devices or individual device types? */
+ std::vector<ATLCPUProcessor> &cpu_procs =
+ g_atl_machine.processors<ATLCPUProcessor>();
+ std::vector<ATLGPUProcessor> &gpu_procs =
+ g_atl_machine.processors<ATLGPUProcessor>();
+ /* For CPU memory pools, add other devices that can access them directly
+ * or indirectly */
+ for (auto &cpu_proc : cpu_procs) {
+ for (auto &cpu_mem : cpu_proc.memories()) {
+ hsa_amd_memory_pool_t pool = cpu_mem.memory();
+ for (auto &gpu_proc : gpu_procs) {
+ hsa_agent_t agent = gpu_proc.agent();
+ hsa_amd_memory_pool_access_t access;
+ hsa_amd_agent_memory_pool_get_info(
+ agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
+ if (access != 0) {
+ // this means not NEVER, but could be YES or NO
+ // add this memory pool to the proc
+ gpu_proc.addMemory(cpu_mem);
+ }
+ }
+ }
+ }
+
+ /* FIXME: are the below combinations of procs and memory pools needed?
+ * all to all compare procs with their memory pools and add those memory
+ * pools that are accessible by the target procs */
+ for (auto &gpu_proc : gpu_procs) {
+ for (auto &gpu_mem : gpu_proc.memories()) {
+ hsa_amd_memory_pool_t pool = gpu_mem.memory();
+ for (auto &cpu_proc : cpu_procs) {
+ hsa_agent_t agent = cpu_proc.agent();
+ hsa_amd_memory_pool_access_t access;
+ hsa_amd_agent_memory_pool_get_info(
+ agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
+ if (access != 0) {
+ // this means not NEVER, but could be YES or NO
+ // add this memory pool to the proc
+ cpu_proc.addMemory(gpu_mem);
+ }
+ }
+ }
+ }
+
+ g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size();
+ g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size();
+
+ size_t num_procs = cpu_procs.size() + gpu_procs.size();
+ // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs *
+ // sizeof(atmi_device_t));
+ atmi_device_t *all_devices = reinterpret_cast<atmi_device_t *>(
+ malloc(num_procs * sizeof(atmi_device_t)));
+ int num_iGPUs = 0;
+ int num_dGPUs = 0;
+ for (uint32_t i = 0; i < gpu_procs.size(); i++) {
+ if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
+ num_iGPUs++;
+ else
+ num_dGPUs++;
+ }
+ assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
+ "Number of dGPUs and iGPUs do not add up");
+ DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
+ DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
+ DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
+ DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());
+
+ g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs;
+ g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs;
+
+ int cpus_begin = 0;
+ int cpus_end = cpu_procs.size();
+ int gpus_begin = cpu_procs.size();
+ int gpus_end = cpu_procs.size() + gpu_procs.size();
+ g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin];
+ g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin];
+ g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin];
+ g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin];
+ int proc_index = 0;
+ for (int i = cpus_begin; i < cpus_end; i++) {
+ all_devices[i].type = cpu_procs[proc_index].type();
+ all_devices[i].core_count = cpu_procs[proc_index].num_cus();
+
+ std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
+ int fine_memories_size = 0;
+ int coarse_memories_size = 0;
+ DEBUG_PRINT("CPU memory types:\t");
+ for (auto &memory : memories) {
+ atmi_memtype_t type = memory.type();
+ if (type == ATMI_MEMTYPE_FINE_GRAINED) {
+ fine_memories_size++;
+ DEBUG_PRINT("Fine\t");
+ } else {
+ coarse_memories_size++;
+ DEBUG_PRINT("Coarse\t");
+ }
+ }
+ DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
+ DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
+ all_devices[i].memory_count = memories.size();
+ proc_index++;
+ }
+ proc_index = 0;
+ for (int i = gpus_begin; i < gpus_end; i++) {
+ all_devices[i].type = gpu_procs[proc_index].type();
+ all_devices[i].core_count = gpu_procs[proc_index].num_cus();
+
+ std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
+ int fine_memories_size = 0;
+ int coarse_memories_size = 0;
+ DEBUG_PRINT("GPU memory types:\t");
+ for (auto &memory : memories) {
+ atmi_memtype_t type = memory.type();
+ if (type == ATMI_MEMTYPE_FINE_GRAINED) {
+ fine_memories_size++;
+ DEBUG_PRINT("Fine\t");
+ } else {
+ coarse_memories_size++;
+ DEBUG_PRINT("Coarse\t");
+ }
+ }
+ DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
+ DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
+ all_devices[i].memory_count = memories.size();
+ proc_index++;
+ }
+ proc_index = 0;
+ atl_cpu_kernarg_region.handle = (uint64_t)-1;
+ if (cpu_procs.size() > 0) {
+ err = hsa_agent_iterate_regions(
+ cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
+ if (err == HSA_STATUS_INFO_BREAK) {
+ err = HSA_STATUS_SUCCESS;
+ }
+ err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
+ : HSA_STATUS_SUCCESS;
+ ErrorCheck(Finding a CPU kernarg memory region handle, err);
+ }
+ /* Find a memory region that supports kernel arguments. */
+ atl_gpu_kernarg_region.handle = (uint64_t)-1;
+ if (gpu_procs.size() > 0) {
+ hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
+ &atl_gpu_kernarg_region);
+ err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
+ : HSA_STATUS_SUCCESS;
+ ErrorCheck(Finding a kernarg memory region, err);
+ }
+ if (num_procs > 0)
+ return HSA_STATUS_SUCCESS;
+ else
+ return HSA_STATUS_ERROR_NOT_INITIALIZED;
+}
+
+hsa_status_t init_hsa() {
+ if (atlc.g_hsa_initialized == false) {
+ DEBUG_PRINT("Initializing HSA...");
+ hsa_status_t err = hsa_init();
+ ErrorCheck(Initializing the hsa runtime, err);
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+
+ err = init_compute_and_memory();
+ if (err != HSA_STATUS_SUCCESS)
+ return err;
+ ErrorCheck(After initializing compute and memory, err);
+
+ int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
+ KernelInfoTable.resize(gpu_count);
+ SymbolInfoTable.resize(gpu_count);
+ for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
+ SymbolInfoTable[i].clear();
+ for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
+ KernelInfoTable[i].clear();
+ atlc.g_hsa_initialized = true;
+ DEBUG_PRINT("done\n");
+ }
+ return HSA_STATUS_SUCCESS;
+}
+
+void init_tasks() {
+ if (atlc.g_tasks_initialized != false)
+ return;
+ hsa_status_t err;
+ std::vector<hsa_agent_t> gpu_agents;
+ int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
+ for (int gpu = 0; gpu < gpu_count; gpu++) {
+ atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
+ ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
+ gpu_agents.push_back(proc.agent());
+ }
+ err = hsa_signal_create(0, 0, NULL, &IdentityCopySignal);
+ ErrorCheck(Creating a HSA signal, err);
+ atlc.g_tasks_initialized = true;
+}
+
+hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
+#if (ROCM_VERSION_MAJOR >= 3) || \
+ (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
+ if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
+#else
+ if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
+#endif
+ hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
+ // memory_fault.agent
+ // memory_fault.virtual_address
+ // memory_fault.fault_reason_mask
+ // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
+ std::stringstream stream;
+ stream << std::hex << (uintptr_t)memory_fault.virtual_address;
+ std::string addr("0x" + stream.str());
+
+ std::string err_string = "[GPU Memory Error] Addr: " + addr;
+ err_string += " Reason: ";
+ // fault_reason_mask has one bit per cause; see
+ // hsa_amd_memory_fault_reason_t
+ if (!(memory_fault.fault_reason_mask & 0x3f)) {
+ err_string += "No Idea! ";
+ } else {
+ if (memory_fault.fault_reason_mask & 0x1)
+ err_string += "Page not present or supervisor privilege. ";
+ if (memory_fault.fault_reason_mask & 0x2)
+ err_string += "Write access to a read-only page. ";
+ if (memory_fault.fault_reason_mask & 0x4)
+ err_string += "Execute access to a page marked NX. ";
+ if (memory_fault.fault_reason_mask & 0x8)
+ err_string += "Host access only. ";
+ if (memory_fault.fault_reason_mask & 0x10)
+ err_string += "ECC failure (if supported by HW). ";
+ if (memory_fault.fault_reason_mask & 0x20)
+ err_string += "Can't determine the exact fault address. ";
+ }
+ fprintf(stderr, "%s\n", err_string.c_str());
+ return HSA_STATUS_ERROR;
+ }
+ return HSA_STATUS_SUCCESS;
+}
+
+atmi_status_t atl_init_gpu_context() {
+ if (atlc.struct_initialized == false)
+ atmi_init_context_structs();
+ if (atlc.g_gpu_initialized != false)
+ return ATMI_STATUS_SUCCESS;
+
+ hsa_status_t err;
+ err = init_hsa();
+ if (err != HSA_STATUS_SUCCESS)
+ return ATMI_STATUS_ERROR;
+
+ int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
+ for (int gpu = 0; gpu < gpu_count; gpu++) {
+ atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
+ ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
+ int num_gpu_queues = core::Runtime::getInstance().getNumGPUQueues();
+ if (num_gpu_queues == -1) {
+ num_gpu_queues = proc.num_cus();
+ num_gpu_queues = (num_gpu_queues > 8) ? 8 : num_gpu_queues;
+ }
+ proc.createQueues(num_gpu_queues);
+ }
+
+ if (context_init_time_init == 0) {
+ clock_gettime(CLOCK_MONOTONIC_RAW, &context_init_time);
+ context_init_time_init = 1;
+ }
+
+ err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
+ ErrorCheck(Registering the system for memory faults, err);
+
+ init_tasks();
+ atlc.g_gpu_initialized = true;
+ return ATMI_STATUS_SUCCESS;
+}
+
+bool isImplicit(KernelArgMD::ValueKind value_kind) {
+ switch (value_kind) {
+ case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
+ case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
+ case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
+ case KernelArgMD::ValueKind::HiddenNone:
+ case KernelArgMD::ValueKind::HiddenPrintfBuffer:
+ case KernelArgMD::ValueKind::HiddenDefaultQueue:
+ case KernelArgMD::ValueKind::HiddenCompletionAction:
+ case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
+ case KernelArgMD::ValueKind::HiddenHostcallBuffer:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static std::pair<unsigned char *, unsigned char *>
+find_metadata(void *binary, size_t binSize) {
+ std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};
+
+ Elf *e = elf_memory(static_cast<char *>(binary), binSize);
+ if (elf_kind(e) != ELF_K_ELF) {
+ return failure;
+ }
+
+ size_t numpHdrs;
+ if (elf_getphdrnum(e, &numpHdrs) != 0) {
+ return failure;
+ }
+
+ for (size_t i = 0; i < numpHdrs; ++i) {
+ GElf_Phdr pHdr;
+ if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
+ continue;
+ }
+ // Look for the runtime metadata note
+ if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
+ // Iterate over the notes in this segment
+ address ptr = (address)binary + pHdr.p_offset;
+ address segmentEnd = ptr + pHdr.p_filesz;
+
+ while (ptr < segmentEnd) {
+ Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
+ address name = (address)&note[1];
+
+ if (note->n_type == 7 || note->n_type == 8) {
+ return failure;
+ } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
+ note->n_namesz == sizeof "AMD" &&
+ !memcmp(name, "AMD", note->n_namesz)) {
+ // code object v2 uses yaml metadata, no longer supported
+ return failure;
+ } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
+ note->n_namesz == sizeof "AMDGPU" &&
+ !memcmp(name, "AMDGPU", note->n_namesz)) {
+
+ // n_descsz = 485
+ // value is padded to 4 byte alignment, may want to move end up to
+ // match
+ size_t offset = sizeof(uint32_t) * 3 /* fields */
+ + sizeof("AMDGPU") /* name */
+ + 1 /* padding to 4 byte alignment */;
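+ // i.e. 12 bytes for the three uint32_t fields, 7 for sizeof "AMDGPU"
+ // (which counts the NUL), and 1 pad byte: 12 + 7 + 1 == 20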
+
+ // Including the trailing padding means both pointers are 4 bytes
+ // aligned, which may be useful later.
+ unsigned char *metadata_start = (unsigned char *)ptr + offset;
+ unsigned char *metadata_end =
+ metadata_start + core::alignUp(note->n_descsz, 4);
+ return {metadata_start, metadata_end};
+ }
+ ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
+ core::alignUp(note->n_descsz, sizeof(int));
+ }
+ }
+ }
+
+ return failure;
+}
+
+namespace {
+int map_lookup_array(msgpack::byte_range message, const char *needle,
+ msgpack::byte_range *res, uint64_t *size) {
+ unsigned count = 0;
+ struct s : msgpack::functors_defaults<s> {
+ s(unsigned &count, uint64_t *size) : count(count), size(size) {}
+ unsigned &count;
+ uint64_t *size;
+ const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
+ count++;
+ *size = N;
+ return bytes.end;
+ }
+ };
+
+ msgpack::foreach_map(message,
+ [&](msgpack::byte_range key, msgpack::byte_range value) {
+ if (msgpack::message_is_string(key, needle)) {
+ // If the message is an array, record number of
+ // elements in *size
+ msgpack::handle_msgpack<s>(value, {count, size});
+ // return the whole array
+ *res = value;
+ }
+ });
+ // Only claim success if exactly one key/array pair matched
+ return count != 1;
+}
+
+int map_lookup_string(msgpack::byte_range message, const char *needle,
+ std::string *res) {
+ unsigned count = 0;
+ struct s : public msgpack::functors_defaults<s> {
+ s(unsigned &count, std::string *res) : count(count), res(res) {}
+ unsigned &count;
+ std::string *res;
+ void handle_string(size_t N, const unsigned char *str) {
+ count++;
+ *res = std::string(str, str + N);
+ }
+ };
+ msgpack::foreach_map(message,
+ [&](msgpack::byte_range key, msgpack::byte_range value) {
+ if (msgpack::message_is_string(key, needle)) {
+ msgpack::handle_msgpack<s>(value, {count, res});
+ }
+ });
+ return count != 1;
+}
+
+int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
+ uint64_t *res) {
+ unsigned count = 0;
+ msgpack::foreach_map(message,
+ [&](msgpack::byte_range key, msgpack::byte_range value) {
+ if (msgpack::message_is_string(key, needle)) {
+ msgpack::foronly_unsigned(value, [&](uint64_t x) {
+ count++;
+ *res = x;
+ });
+ }
+ });
+ return count != 1;
+}
+
+int array_lookup_element(msgpack::byte_range message, uint64_t elt,
+ msgpack::byte_range *res) {
+ int rc = 1;
+ uint64_t i = 0;
+ msgpack::foreach_array(message, [&](msgpack::byte_range value) {
+ if (i == elt) {
+ *res = value;
+ rc = 0;
+ }
+ i++;
+ });
+ return rc;
+}
+
+int populate_kernelArgMD(msgpack::byte_range args_element,
+ KernelArgMD *kernelarg) {
+ using namespace msgpack;
+ int error = 0;
+ foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
+ if (message_is_string(key, ".name")) {
+ foronly_string(value, [&](size_t N, const unsigned char *str) {
+ kernelarg->name_ = std::string(str, str + N);
+ });
+ } else if (message_is_string(key, ".type_name")) {
+ foronly_string(value, [&](size_t N, const unsigned char *str) {
+ kernelarg->typeName_ = std::string(str, str + N);
+ });
+ } else if (message_is_string(key, ".size")) {
+ foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
+ } else if (message_is_string(key, ".offset")) {
+ foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
+ } else if (message_is_string(key, ".value_kind")) {
+ foronly_string(value, [&](size_t N, const unsigned char *str) {
+ std::string s = std::string(str, str + N);
+ auto itValueKind = ArgValueKind.find(s);
+ if (itValueKind != ArgValueKind.end()) {
+ kernelarg->valueKind_ = itValueKind->second;
+ }
+ });
+ }
+ });
+ return error;
+}
+} // namespace
+
+static hsa_status_t get_code_object_custom_metadata(void *binary,
+ size_t binSize, int gpu) {
+ // parse code object with different keys from v2
+ // also, the kernel name is not the same as the symbol name -- so a
+ // symbol->name map is needed
+
+ std::pair<unsigned char *, unsigned char *> metadata =
+ find_metadata(binary, binSize);
+ if (!metadata.first) {
+ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+ }
+
+ uint64_t kernelsSize = 0;
+ int msgpack_errors = 0;
+ msgpack::byte_range kernel_array;
+ msgpack_errors =
+ map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
+ &kernel_array, &kernelsSize);
+ msgpackErrorCheck(kernels lookup in program metadata, msgpack_errors);
+
+ for (size_t i = 0; i < kernelsSize; i++) {
+ assert(msgpack_errors == 0);
+ std::string kernelName;
+ std::string languageName;
+ std::string symbolName;
+
+ msgpack::byte_range element;
+ msgpack_errors += array_lookup_element(kernel_array, i, &element);
+ msgpackErrorCheck(element lookup in kernel metadata, msgpack_errors);
+
+ msgpack_errors += map_lookup_string(element, ".name", &kernelName);
+ msgpack_errors += map_lookup_string(element, ".language", &languageName);
+ msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
+ msgpackErrorCheck(strings lookup in kernel metadata, msgpack_errors);
+
+ atl_kernel_info_t info = {0, 0, 0, 0, 0, {}, {}, {}};
+ size_t kernel_explicit_args_size = 0;
+ uint64_t kernel_segment_size;
+ msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
+ &kernel_segment_size);
+ msgpackErrorCheck(kernarg segment size metadata lookup in kernel metadata,
+ msgpack_errors);
+
+ // create a map from symbol to name
+ DEBUG_PRINT("Kernel symbol %s; Name: %s; Size: %lu\n", symbolName.c_str(),
+ kernelName.c_str(), kernel_segment_size);
+ KernelNameMap[symbolName] = kernelName;
+
+ bool hasHiddenArgs = false;
+ if (kernel_segment_size > 0) {
+ uint64_t argsSize;
+ size_t offset = 0;
+
+ msgpack::byte_range args_array;
+ msgpack_errors +=
+ map_lookup_array(element, ".args", &args_array, &argsSize);
+ msgpackErrorCheck(kernel args metadata lookup in kernel metadata,
+ msgpack_errors);
+
+ info.num_args = argsSize;
+
+ for (size_t i = 0; i < argsSize; ++i) {
+ KernelArgMD lcArg;
+
+ msgpack::byte_range args_element;
+ msgpack_errors += array_lookup_element(args_array, i, &args_element);
+ msgpackErrorCheck(iterate args map in kernel args metadata,
+ msgpack_errors);
+
+ msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
+ msgpackErrorCheck(iterate args map in kernel args metadata,
+ msgpack_errors);
+
+ // TODO(ashwinma): should the below population actions be done only for
+ // non-implicit args?
+ // populate info with sizes and offsets
+ info.arg_sizes.push_back(lcArg.size_);
+ // v3 has offset field and not align field
+ size_t new_offset = lcArg.offset_;
+ size_t padding = new_offset - offset;
+ offset = new_offset;
+ info.arg_offsets.push_back(lcArg.offset_);
+ DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
+ lcArg.size_, lcArg.offset_);
+ offset += lcArg.size_;
+
+ // check if the arg is a hidden/implicit arg
+ // this logic assumes that all hidden args are 8-byte aligned
+ if (!isImplicit(lcArg.valueKind_)) {
+ kernel_explicit_args_size += lcArg.size_;
+ } else {
+ hasHiddenArgs = true;
+ }
+ kernel_explicit_args_size += padding;
+ }
+ }
+
+ // add the size of the implicit args, e.g. offset x, y and z and the pipe
+ // pointer; ATMI does not count the compiler-set implicit args, it discounts
+ // them and appends its own implicit args instead
+ info.kernel_segment_size =
+ (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
+ sizeof(atmi_implicit_args_t);
+ DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
+ kernel_segment_size, info.kernel_segment_size);
+
+ // kernel received, now add it to the kernel info table
+ KernelInfoTable[gpu][kernelName] = info;
+ }
+
+ return HSA_STATUS_SUCCESS;
+}
+
+static hsa_status_t populate_InfoTables(hsa_executable_t executable,
+ hsa_executable_symbol_t symbol,
+ void *data) {
+ int gpu = *static_cast<int *>(data);
+ hsa_symbol_kind_t type;
+
+ uint32_t name_length;
+ hsa_status_t err;
+ err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
+ &type);
+ ErrorCheck(Symbol info extraction, err);
+ DEBUG_PRINT("Exec Symbol type: %d\n", type);
+ if (type == HSA_SYMBOL_KIND_KERNEL) {
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
+ ErrorCheck(Symbol info extraction, err);
+ char *name = reinterpret_cast<char *>(malloc(name_length + 1));
+ err = hsa_executable_symbol_get_info(symbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+ ErrorCheck(Symbol info extraction, err);
+ name[name_length] = 0;
+
+ if (KernelNameMap.find(std::string(name)) == KernelNameMap.end()) {
+ // did not find kernel name in the kernel map; this can happen only
+ // if the ROCr API for getting symbol info (name) is different from
+ // the comgr method of getting symbol info
+ ErrorCheck(Invalid kernel name, HSA_STATUS_ERROR_INVALID_CODE_OBJECT);
+ }
+ atl_kernel_info_t info;
+ std::string kernelName = KernelNameMap[std::string(name)];
+ // by now, the kernel info table should already have an entry
+ // because the non-ROCr custom code object parsing is called before
+ // iterating over the code object symbols using ROCr
+ if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
+ ErrorCheck(Finding the entry kernel info table,
+ HSA_STATUS_ERROR_INVALID_CODE_OBJECT);
+ }
+ // found, so assign and update
+ info = KernelInfoTable[gpu][kernelName];
+
+ /* Extract dispatch information from the symbol */
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+ &(info.kernel_object));
+ ErrorCheck(Extracting the symbol from the executable, err);
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+ &(info.group_segment_size));
+ ErrorCheck(Extracting the group segment size from the executable, err);
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+ &(info.private_segment_size));
+ ErrorCheck(Extracting the private segment from the executable, err);
+
+ DEBUG_PRINT(
+ "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
+ "kernarg\n",
+ kernelName.c_str(), info.kernel_object, info.group_segment_size,
+ info.private_segment_size, info.kernel_segment_size);
+
+ // assign it back to the kernel info table
+ KernelInfoTable[gpu][kernelName] = info;
+ free(name);
+ } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
+ ErrorCheck(Symbol info extraction, err);
+ char *name = reinterpret_cast<char *>(malloc(name_length + 1));
+ err = hsa_executable_symbol_get_info(symbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+ ErrorCheck(Symbol info extraction, err);
+ name[name_length] = 0;
+
+ atl_symbol_info_t info;
+
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
+ ErrorCheck(Symbol info address extraction, err);
+
+ err = hsa_executable_symbol_get_info(
+ symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
+ ErrorCheck(Symbol info size extraction, err);
+
+ atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
+ DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
+ info.size);
+ register_allocation(reinterpret_cast<void *>(info.addr), (size_t)info.size,
+ place);
+ SymbolInfoTable[gpu][std::string(name)] = info;
+ if (strcmp(name, "needs_hostcall_buffer") == 0)
+ g_atmi_hostcall_required = true;
+ free(name);
+ } else {
+ DEBUG_PRINT("Symbol is an indirect function\n");
+ }
+ return HSA_STATUS_SUCCESS;
+}
+
+atmi_status_t Runtime::RegisterModuleFromMemory(
+ void *module_bytes, size_t module_size, atmi_place_t place,
+ atmi_status_t (*on_deserialized_data)(void *data, size_t size,
+ void *cb_state),
+ void *cb_state) {
+ hsa_status_t err;
+ int gpu = place.device_id;
+ assert(gpu >= 0);
+
+ DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
+ ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
+ hsa_agent_t agent = proc.agent();
+ hsa_executable_t executable = {0};
+ hsa_profile_t agent_profile;
+
+ err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
+ ErrorCheck(Query the agent profile, err);
+ // FIXME: Assume that every profile is FULL until we understand how to build
+ // GCN with base profile
+ agent_profile = HSA_PROFILE_FULL;
+ /* Create the empty executable. */
+ err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
+ &executable);
+ ErrorCheck(Create the executable, err);
+
+ bool module_load_success = false;
+ do // Existing control flow used continue, preserve that for this patch
+ {
+ {
+ // Some metadata info is not available through ROCr API, so use custom
+ // code object metadata parsing to collect such metadata info
+
+ err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
+ ErrorCheckAndContinue(Getting custom code object metadata, err);
+
+ // Deserialize code object.
+ hsa_code_object_t code_object = {0};
+ err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
+ &code_object);
+ ErrorCheckAndContinue(Code Object Deserialization, err);
+ assert(0 != code_object.handle);
+
+ // Mutating the device image here avoids another allocation & memcpy
+ void *code_object_alloc_data =
+ reinterpret_cast<void *>(code_object.handle);
+ atmi_status_t atmi_err =
+ on_deserialized_data(code_object_alloc_data, module_size, cb_state);
+ ATMIErrorCheck(Error in deserialized_data callback, atmi_err);
+
+ /* Load the code object. */
+ err =
+ hsa_executable_load_code_object(executable, agent, code_object, NULL);
+ ErrorCheckAndContinue(Loading the code object, err);
+
+ // cannot iterate over symbols until executable is frozen
+ }
+ module_load_success = true;
+ } while (0);
+ DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success);
+ if (module_load_success) {
+ /* Freeze the executable; it can now be queried for symbols. */
+ err = hsa_executable_freeze(executable, "");
+ ErrorCheck(Freeze the executable, err);
+
+ err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
+ static_cast<void *>(&gpu));
+ ErrorCheck(Iterating over symbols for executable, err);
+
+ // save the executable and destroy during finalize
+ g_executables.push_back(executable);
+ return ATMI_STATUS_SUCCESS;
+ } else {
+ return ATMI_STATUS_ERROR;
+ }
+}
+
+} // namespace core
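
The explicit-argument bookkeeping in get_code_object_custom_metadata above can
be illustrated with hypothetical metadata (a sketch; the sizes and offsets are
made up, not real compiler output):

#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical .args metadata: (size, offset) per kernel argument.
  const struct { size_t size, off; } args[] = {{4, 0}, {8, 8}, {4, 16}};

  size_t offset = 0, explicit_size = 0;
  for (const auto &a : args) {
    size_t padding = a.off - offset; // alignment gap before this argument
    offset = a.off + a.size;
    explicit_size += a.size + padding; // a hidden arg would add only padding
  }
  // 4 + (4 padding + 8) + 4 == 20; the runtime then appends
  // sizeof(atmi_implicit_args_t) to form kernel_segment_size.
  printf("explicit kernarg bytes: %zu\n", explicit_size);
  return 0;
}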
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
new file mode 100644
index 000000000000..8ce6c7bd585c
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp
@@ -0,0 +1,136 @@
+/*===--------------------------------------------------------------------------
+ * ATMI (Asynchronous Task and Memory Interface)
+ *
+ * This file is distributed under the MIT License. See LICENSE.txt for details.
+ *===------------------------------------------------------------------------*/
+#include "internal.h"
+#include "rt.h"
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <iostream>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+
+/*
+ * Helper functions
+ */
+const char *get_atmi_error_string(atmi_status_t err) {
+ switch (err) {
+ case ATMI_STATUS_SUCCESS:
+ return "ATMI_STATUS_SUCCESS";
+ case ATMI_STATUS_UNKNOWN:
+ return "ATMI_STATUS_UNKNOWN";
+ case ATMI_STATUS_ERROR:
+ return "ATMI_STATUS_ERROR";
+ default:
+ return "";
+ }
+}
+
+const char *get_error_string(hsa_status_t err) {
+ switch (err) {
+ case HSA_STATUS_SUCCESS:
+ return "HSA_STATUS_SUCCESS";
+ case HSA_STATUS_INFO_BREAK:
+ return "HSA_STATUS_INFO_BREAK";
+ case HSA_STATUS_ERROR:
+ return "HSA_STATUS_ERROR";
+ case HSA_STATUS_ERROR_INVALID_ARGUMENT:
+ return "HSA_STATUS_ERROR_INVALID_ARGUMENT";
+ case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION:
+ return "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION";
+ case HSA_STATUS_ERROR_INVALID_ALLOCATION:
+ return "HSA_STATUS_ERROR_INVALID_ALLOCATION";
+ case HSA_STATUS_ERROR_INVALID_AGENT:
+ return "HSA_STATUS_ERROR_INVALID_AGENT";
+ case HSA_STATUS_ERROR_INVALID_REGION:
+ return "HSA_STATUS_ERROR_INVALID_REGION";
+ case HSA_STATUS_ERROR_INVALID_SIGNAL:
+ return "HSA_STATUS_ERROR_INVALID_SIGNAL";
+ case HSA_STATUS_ERROR_INVALID_QUEUE:
+ return "HSA_STATUS_ERROR_INVALID_QUEUE";
+ case HSA_STATUS_ERROR_OUT_OF_RESOURCES:
+ return "HSA_STATUS_ERROR_OUT_OF_RESOURCES";
+ case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
+ return "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT";
+ case HSA_STATUS_ERROR_RESOURCE_FREE:
+ return "HSA_STATUS_ERROR_RESOURCE_FREE";
+ case HSA_STATUS_ERROR_NOT_INITIALIZED:
+ return "HSA_STATUS_ERROR_NOT_INITIALIZED";
+ case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW:
+ return "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW";
+ case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
+ return "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS";
+ case HSA_STATUS_ERROR_INVALID_INDEX:
+ return "HSA_STATUS_ERROR_INVALID_INDEX";
+ case HSA_STATUS_ERROR_INVALID_ISA:
+ return "HSA_STATUS_ERROR_INVALID_ISA";
+ case HSA_STATUS_ERROR_INVALID_ISA_NAME:
+ return "HSA_STATUS_ERROR_INVALID_ISA_NAME";
+ case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
+ return "HSA_STATUS_ERROR_INVALID_CODE_OBJECT";
+ case HSA_STATUS_ERROR_INVALID_EXECUTABLE:
+ return "HSA_STATUS_ERROR_INVALID_EXECUTABLE";
+ case HSA_STATUS_ERROR_FROZEN_EXECUTABLE:
+ return "HSA_STATUS_ERROR_FROZEN_EXECUTABLE";
+ case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME:
+ return "HSA_STATUS_ERROR_INVALID_SYMBOL_NAME";
+ case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED:
+ return "HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED";
+ case HSA_STATUS_ERROR_VARIABLE_UNDEFINED:
+ return "HSA_STATUS_ERROR_VARIABLE_UNDEFINED";
+ case HSA_STATUS_ERROR_EXCEPTION:
+ return "HSA_STATUS_ERROR_EXCEPTION";
+  }
+  // Avoid falling off the end of a non-void function for out-of-range values.
+  return "Unknown HSA_STATUS";
+}
+
+namespace core {
+/*
+ * Environment variables
+ */
+void Environment::GetEnvAll() {
+ std::string var = GetEnv("ATMI_HELP");
+ if (!var.empty()) {
+ std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl
+ << "ATMI_MAX_KERNEL_TYPES : positive integer" << std::endl
+ << "ATMI_DEVICE_GPU_WORKERS : positive integer" << std::endl
+ << "ATMI_DEVICE_CPU_WORKERS : positive integer" << std::endl
+ << "ATMI_DEBUG : 1 for printing out trace/debug info"
+ << std::endl;
+ exit(0);
+ }
+
+ var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE");
+ if (!var.empty())
+ max_queue_size_ = std::stoi(var);
+
+ var = GetEnv("ATMI_MAX_KERNEL_TYPES");
+ if (!var.empty())
+ max_kernel_types_ = std::stoi(var);
+
+ /* TODO: If we get a good use case for device-specific worker count, we
+ * should explore it, but let us keep the worker count uniform for all
+ * devices of a type until that time
+ */
+ var = GetEnv("ATMI_DEVICE_GPU_WORKERS");
+ if (!var.empty())
+ num_gpu_queues_ = std::stoi(var);
+
+ /* TODO: If we get a good use case for device-specific worker count, we
+ * should explore it, but let us keep the worker count uniform for all
+ * devices of a type until that time
+ */
+ var = GetEnv("ATMI_DEVICE_CPU_WORKERS");
+ if (!var.empty())
+ num_cpu_queues_ = std::stoi(var);
+
+ var = GetEnv("ATMI_DEBUG");
+ if (!var.empty())
+ debug_mode_ = std::stoi(var);
+}
+} // namespace core
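// [Editor's note, not part of this commit] GetEnv is assumed to be a thin
// null-safe wrapper over getenv declared elsewhere in this patch, roughly:
//
//   std::string Environment::GetEnv(const char *name) {
//     const char *v = getenv(name);
//     return v ? std::string(v) : std::string();
//   }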
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
new file mode 100644
index 000000000000..54d42e0436a3
--- /dev/null
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -0,0 +1,1713 @@
+//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for AMD GPUs via the HSA runtime
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <assert.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <elf.h>
+#include <ffi.h>
+#include <fstream>
+#include <iostream>
+#include <libelf.h>
+#include <list>
+#include <memory>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+// Header from ATMI interface
+#include "atmi_interop_hsa.h"
+#include "atmi_runtime.h"
+
+#include "internal.h"
+
+#include "internal.h"
+
+#include "omptargetplugin.h"
+
+// Get static gpu grid values from clang target-specific constants managed
+// in the header file llvm/Frontend/OpenMP/OMPGridValues.h
+// Copied verbatim to meet the requirement that libomptarget builds without
+// a copy of llvm checked out nearby
+namespace llvm {
+namespace omp {
+enum GVIDX {
+ /// The maximum number of workers in a kernel.
+  /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be an issue for blockDim.z
+ GV_Threads,
+ /// The size reserved for data in a shared memory slot.
+ GV_Slot_Size,
+ /// The default value of maximum number of threads in a worker warp.
+ GV_Warp_Size,
+ /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size
+ /// for NVPTX.
+ GV_Warp_Size_32,
+  /// The number of bits required to represent the max number of threads in a warp
+ GV_Warp_Size_Log2,
+ /// GV_Warp_Size * GV_Slot_Size,
+ GV_Warp_Slot_Size,
+ /// the maximum number of teams.
+ GV_Max_Teams,
+ /// Global Memory Alignment
+ GV_Mem_Align,
+ /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
+ GV_Warp_Size_Log2_Mask,
+ // An alternative to the heavy data sharing infrastructure that uses global
+ // memory is one that uses device __shared__ memory. The amount of such space
+ // (in bytes) reserved by the OpenMP runtime is noted here.
+ GV_SimpleBufferSize,
+ // The absolute maximum team size for a working group
+ GV_Max_WG_Size,
+ // The default maximum team size for a working group
+ GV_Default_WG_Size,
+  // This is GV_Max_WG_Size / GV_Warp_Size: 32 for NVPTX and 16 for AMDGCN.
+ GV_Max_Warp_Number,
+ /// The slot size that should be reserved for a working warp.
+ /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
+ GV_Warp_Size_Log2_MaskL
+};
+
+static constexpr unsigned AMDGPUGpuGridValues[] = {
+ 448, // GV_Threads
+ 256, // GV_Slot_Size
+ 64, // GV_Warp_Size
+ 32, // GV_Warp_Size_32
+ 6, // GV_Warp_Size_Log2
+ 64 * 256, // GV_Warp_Slot_Size
+ 128, // GV_Max_Teams
+ 256, // GV_Mem_Align
+ 63, // GV_Warp_Size_Log2_Mask
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+    256,       // GV_Default_WG_Size
+ 1024 / 64, // GV_Max_WG_Size / GV_WarpSize
+ 63 // GV_Warp_Size_Log2_MaskL
+};
+} // namespace omp
+} // namespace llvm
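// [Editor's note, not part of this commit] The table encodes AMDGCN
// defaults: a 64-lane wavefront, so GV_Warp_Size_Log2 = log2(64) = 6 and
// GV_Warp_Size_Log2_Mask = 2^6 - 1 = 63; GV_Warp_Slot_Size = 64 * 256 =
// 16384; and GV_Max_Warp_Number = GV_Max_WG_Size / GV_Warp_Size =
// 1024 / 64 = 16.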
+
+#ifndef TARGET_NAME
+#define TARGET_NAME AMDHSA
+#endif
+
+int print_kernel_trace;
+
+// Size of the target call stack structure
+uint32_t TgtStackItemSize = 0;
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) \
+ {}
+#endif // OMPTARGET_DEBUG
+
+#undef check // Drop definition from internal.h
+#ifdef OMPTARGET_DEBUG
+#define check(msg, status) \
+ if (status != ATMI_STATUS_SUCCESS) { \
+ /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg);*/ \
+ DP(#msg " failed\n"); \
+ /*assert(0);*/ \
+ } else { \
+ /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg); \
+ */ \
+ DP(#msg " succeeded\n"); \
+ }
+#else
+#define check(msg, status) \
+ {}
+#endif
+
+#include "../../common/elf_common.c"
+
+static bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
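+  // 224 is EM_AMDGPU, the ELF e_machine value for amdgcn code objects.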
+ const uint16_t amdgcnMachineID = 224;
+ int32_t r = elf_check_machine(image, amdgcnMachineID);
+ if (!r) {
+ DP("Supported machine ID not found\n");
+ }
+ return r;
+}
+
+/// Keep entries table per device
+struct FuncOrGblEntryTy {
+ __tgt_target_table Table;
+ std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ NONE
+};
+
+struct KernelArgPool {
+private:
+ static pthread_mutex_t mutex;
+
+public:
+ uint32_t kernarg_segment_size;
+ void *kernarg_region = nullptr;
+ std::queue<int> free_kernarg_segments;
+
+ uint32_t kernarg_size_including_implicit() {
+ return kernarg_segment_size + sizeof(atmi_implicit_args_t);
+ }
+
+ ~KernelArgPool() {
+ if (kernarg_region) {
+ auto r = hsa_amd_memory_pool_free(kernarg_region);
+ assert(r == HSA_STATUS_SUCCESS);
+ ErrorCheck(Memory pool free, r);
+ }
+ }
+
+ // Can't really copy or move a mutex
+ KernelArgPool() = default;
+ KernelArgPool(const KernelArgPool &) = delete;
+ KernelArgPool(KernelArgPool &&) = delete;
+
+ KernelArgPool(uint32_t kernarg_segment_size)
+ : kernarg_segment_size(kernarg_segment_size) {
+
+    // atmi uses one pool per kernel for all gpus, with a fixed upper size.
+    // That exact scheme is preserved here, including the queue<int>.
+ {
+ hsa_status_t err = hsa_amd_memory_pool_allocate(
+ atl_gpu_kernarg_pools[0],
+ kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
+ &kernarg_region);
+ ErrorCheck(Allocating memory for the executable-kernel, err);
+ core::allow_access_to_all_gpu_agents(kernarg_region);
+
+ for (int i = 0; i < MAX_NUM_KERNELS; i++) {
+ free_kernarg_segments.push(i);
+ }
+ }
+ }
+
+ void *allocate(uint64_t arg_num) {
+ assert((arg_num * sizeof(void *)) == kernarg_segment_size);
+ lock l(&mutex);
+ void *res = nullptr;
+ if (!free_kernarg_segments.empty()) {
+
+ int free_idx = free_kernarg_segments.front();
+ res = static_cast<void *>(static_cast<char *>(kernarg_region) +
+ (free_idx * kernarg_size_including_implicit()));
+ assert(free_idx == pointer_to_index(res));
+ free_kernarg_segments.pop();
+ }
+ return res;
+ }
+
+ void deallocate(void *ptr) {
+ lock l(&mutex);
+ int idx = pointer_to_index(ptr);
+ free_kernarg_segments.push(idx);
+ }
+
+private:
+ int pointer_to_index(void *ptr) {
+    ptrdiff_t bytes =
+        static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
+ assert(bytes >= 0);
+ assert(bytes % kernarg_size_including_implicit() == 0);
+ return bytes / kernarg_size_including_implicit();
+ }
+ struct lock {
+ lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
+ ~lock() { pthread_mutex_unlock(m); }
+ pthread_mutex_t *m;
+ };
+};
+pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;
+
+std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
+ KernelArgPoolMap;
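// [Editor's sketch, not part of this commit] Intended use of the pool
// above: one allocate per kernel dispatch, one deallocate once the
// completion signal fires. Assumes ATMI/HSA initialization has already
// populated atl_gpu_kernarg_pools.
//
//   void *kernarg = pool.allocate(arg_num);  // nullptr if all segments busy
//   for (int i = 0; i < arg_num; i++)
//     memcpy(static_cast<char *>(kernarg) + sizeof(void *) * i, args[i],
//            sizeof(void *));
//   // ... publish the dispatch packet with kernarg_address = kernarg ...
//   pool.deallocate(kernarg);                // after the completion signal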
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+ // execution mode of kernel
+ // 0 - SPMD mode (without master warp)
+ // 1 - Generic mode (with master warp)
+ int8_t ExecutionMode;
+ int16_t ConstWGSize;
+ int8_t MaxParLevel;
+ int32_t device_id;
+ void *CallStackAddr;
+ const char *Name;
+
+ KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int8_t _MaxParLevel,
+ int32_t _device_id, void *_CallStackAddr, const char *_Name,
+ uint32_t _kernarg_segment_size)
+ : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
+ MaxParLevel(_MaxParLevel), device_id(_device_id),
+ CallStackAddr(_CallStackAddr), Name(_Name) {
+ DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
+
+ std::string N(_Name);
+ if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
+ KernelArgPoolMap.insert(
+ std::make_pair(N, std::unique_ptr<KernelArgPool>(
+ new KernelArgPool(_kernarg_segment_size))));
+ }
+ }
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+// ATMI API to get gpu and gpu memory place
+static atmi_place_t get_gpu_place(int device_id) {
+ return ATMI_PLACE_GPU(0, device_id);
+}
+static atmi_mem_place_t get_gpu_mem_place(int device_id) {
+ return ATMI_MEM_PLACE_GPU_MEM(0, device_id, 0);
+}
+
+static std::vector<hsa_agent_t> find_gpu_agents() {
+ std::vector<hsa_agent_t> res;
+
+ hsa_status_t err = hsa_iterate_agents(
+ [](hsa_agent_t agent, void *data) -> hsa_status_t {
+ std::vector<hsa_agent_t> *res =
+ static_cast<std::vector<hsa_agent_t> *>(data);
+
+ hsa_device_type_t device_type;
+ // get_info fails iff HSA runtime not yet initialized
+ hsa_status_t err =
+ hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
+ if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
+ printf("rtl.cpp: err %d\n", err);
+ assert(err == HSA_STATUS_SUCCESS);
+
+ if (device_type == HSA_DEVICE_TYPE_GPU) {
+ res->push_back(agent);
+ }
+ return HSA_STATUS_SUCCESS;
+ },
+ &res);
+
+ // iterate_agents fails iff HSA runtime not yet initialized
+ if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
+ printf("rtl.cpp: err %d\n", err);
+ assert(err == HSA_STATUS_SUCCESS);
+ return res;
+}
+
+static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
+ void *data) {
+ if (status != HSA_STATUS_SUCCESS) {
+ const char *status_string;
+ if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
+ status_string = "unavailable";
+ }
+ fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__,
+ __LINE__, source, status, status_string);
+ abort();
+ }
+}
+
+namespace core {
+void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
+ __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
+}
+
+uint16_t create_header(hsa_packet_type_t type, int barrier,
+ atmi_task_fence_scope_t acq_fence,
+ atmi_task_fence_scope_t rel_fence) {
+ uint16_t header = type << HSA_PACKET_HEADER_TYPE;
+ header |= barrier << HSA_PACKET_HEADER_BARRIER;
+  header |= static_cast<uint16_t>(acq_fence)
+            << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  header |= static_cast<uint16_t>(rel_fence)
+            << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+ return header;
+}
+} // namespace core
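// [Editor's note, not part of this commit] With the bit positions from
// hsa.h (TYPE at bit 0, BARRIER at bit 8, acquire scope at bit 9, release
// scope at bit 11), the header used for dispatch later in this patch works
// out to:
//
//   create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH /*2*/, 0,
//                 ATMI_FENCE_SCOPE_SYSTEM /*2*/, ATMI_FENCE_SCOPE_SYSTEM /*2*/)
//     == (2 << 0) | (0 << 8) | (2 << 9) | (2 << 11) == 0x1402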
+
+/// Class containing all the device information
+class RTLDeviceInfoTy {
+ std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+ int NumberOfDevices;
+
+ // GPU devices
+ std::vector<hsa_agent_t> HSAAgents;
+ std::vector<hsa_queue_t *> HSAQueues; // one per gpu
+
+ // Device properties
+ std::vector<int> ComputeUnits;
+ std::vector<int> GroupsPerDevice;
+ std::vector<int> ThreadsPerGroup;
+ std::vector<int> WarpSize;
+
+ // OpenMP properties
+ std::vector<int> NumTeams;
+ std::vector<int> NumThreads;
+
+ // OpenMP Environment properties
+ int EnvNumTeams;
+ int EnvTeamLimit;
+ int EnvMaxTeamsDefault;
+
+ // OpenMP Requires Flags
+ int64_t RequiresFlags;
+
+ // Resource pools
+ SignalPoolT FreeSignalPool;
+
+ static const int HardTeamLimit = 1 << 20; // 1 Meg
+ static const int DefaultNumTeams = 128;
+ static const int Max_Teams =
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
+ static const int Warp_Size =
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
+ static const int Max_WG_Size =
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
+ static const int Default_WG_Size =
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
+
+ // Record entry point associated with device
+ void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ E.Entries.push_back(entry);
+ }
+
+ // Return true if the entry is associated with device
+ bool findOffloadEntry(int32_t device_id, void *addr) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ for (auto &it : E.Entries) {
+ if (it.addr == addr)
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return the pointer to the target entries table
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ int32_t size = E.Entries.size();
+
+ // Table is empty
+ if (!size)
+ return 0;
+
+ __tgt_offload_entry *begin = &E.Entries[0];
+ __tgt_offload_entry *end = &E.Entries[size - 1];
+
+ // Update table info according to the entries and return the pointer
+ E.Table.EntriesBegin = begin;
+ E.Table.EntriesEnd = ++end;
+
+ return &E.Table;
+ }
+
+ // Clear entries table for a device
+ void clearOffloadEntriesTable(int device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncGblEntries[device_id].emplace_back();
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+ // KernelArgPoolMap.clear();
+ E.Entries.clear();
+ E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+ }
+
+ RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEBUG"))
+ DebugLevel = std::stoi(envStr);
+#endif // OMPTARGET_DEBUG
+
+    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
+    // at runtime; it does not require a debug build of the library.
+ // 0 => no tracing
+ // 1 => tracing dispatch only
+ // >1 => verbosity increase
+ if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
+ print_kernel_trace = atoi(envStr);
+ else
+ print_kernel_trace = 0;
+
+ DP("Start initializing HSA-ATMI\n");
+ atmi_status_t err = atmi_init();
+ if (err != ATMI_STATUS_SUCCESS) {
+ DP("Error when initializing HSA-ATMI\n");
+ return;
+ }
+
+ HSAAgents = find_gpu_agents();
+ NumberOfDevices = (int)HSAAgents.size();
+
+ if (NumberOfDevices == 0) {
+ DP("There are no devices supporting HSA.\n");
+ return;
+ } else {
+ DP("There are %d devices supporting HSA.\n", NumberOfDevices);
+ }
+
+ // Init the device info
+ HSAQueues.resize(NumberOfDevices);
+ FuncGblEntries.resize(NumberOfDevices);
+ ThreadsPerGroup.resize(NumberOfDevices);
+ ComputeUnits.resize(NumberOfDevices);
+ GroupsPerDevice.resize(NumberOfDevices);
+ WarpSize.resize(NumberOfDevices);
+ NumTeams.resize(NumberOfDevices);
+ NumThreads.resize(NumberOfDevices);
+
+ for (int i = 0; i < NumberOfDevices; i++) {
+ uint32_t queue_size = 0;
+ {
+ hsa_status_t err;
+ err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+ &queue_size);
+ ErrorCheck(Querying the agent maximum queue size, err);
+ if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
+ queue_size = core::Runtime::getInstance().getMaxQueueSize();
+ }
+ }
+
+ hsa_status_t rc = hsa_queue_create(
+ HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,
+ UINT32_MAX, UINT32_MAX, &HSAQueues[i]);
+ if (rc != HSA_STATUS_SUCCESS) {
+ DP("Failed to create HSA queues\n");
+ return;
+ }
+ }
+
+ for (int i = 0; i < NumberOfDevices; i++) {
+ ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
+ GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
+ ComputeUnits[i] = 1;
+ DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
+ GroupsPerDevice[i], ThreadsPerGroup[i]);
+ }
+
+ // Get environment variables regarding teams
+ char *envStr = getenv("OMP_TEAM_LIMIT");
+ if (envStr) {
+ // OMP_TEAM_LIMIT has been set
+ EnvTeamLimit = std::stoi(envStr);
+ DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+ } else {
+ EnvTeamLimit = -1;
+ }
+ envStr = getenv("OMP_NUM_TEAMS");
+ if (envStr) {
+ // OMP_NUM_TEAMS has been set
+ EnvNumTeams = std::stoi(envStr);
+ DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+ } else {
+ EnvNumTeams = -1;
+ }
+ // Get environment variables regarding expMaxTeams
+ envStr = getenv("OMP_MAX_TEAMS_DEFAULT");
+ if (envStr) {
+ EnvMaxTeamsDefault = std::stoi(envStr);
+ DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault);
+ } else {
+ EnvMaxTeamsDefault = -1;
+ }
+
+ // Default state.
+ RequiresFlags = OMP_REQ_UNDEFINED;
+ }
+
+ ~RTLDeviceInfoTy() {
+ DP("Finalizing the HSA-ATMI DeviceInfo.\n");
+ KernelArgPoolMap.clear(); // calls hsa to free memory
+ atmi_finalize();
+ }
+};
+
+pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
+
+// TODO: May need to drop the trailing two fields until deviceRTL is updated
+struct omptarget_device_environmentTy {
+ int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
+ // only useful for Debug build of deviceRTLs
+ int32_t num_devices; // gets number of active offload devices
+ int32_t device_num; // gets a value 0 to num_devices-1
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+namespace {
+
+int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr) {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+ assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+ // Return success if we are not copying back to host from target.
+ if (!HstPtr)
+ return OFFLOAD_SUCCESS;
+ atmi_status_t err;
+ DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
+ (long long unsigned)(Elf64_Addr)TgtPtr,
+ (long long unsigned)(Elf64_Addr)HstPtr);
+ err = atmi_memcpy(HstPtr, TgtPtr, (size_t)Size);
+ if (err != ATMI_STATUS_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: "
+ "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+ (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
+ return OFFLOAD_FAIL;
+ }
+ DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
+ (long long unsigned)(Elf64_Addr)TgtPtr,
+ (long long unsigned)(Elf64_Addr)HstPtr);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr) {
+ assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+ atmi_status_t err;
+ assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
+ // Return success if we are not doing host to target.
+ if (!HstPtr)
+ return OFFLOAD_SUCCESS;
+
+ DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
+ (long long unsigned)(Elf64_Addr)HstPtr,
+ (long long unsigned)(Elf64_Addr)TgtPtr);
+ err = atmi_memcpy(TgtPtr, HstPtr, (size_t)Size);
+ if (err != ATMI_STATUS_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: "
+ "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
+ (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+// Async.
+// The implementation was written with cuda streams in mind. The semantics of
+// that are to execute kernels on a queue in order of insertion. A synchronise
+// call then makes writes visible between host and device. This means a series
+// of N data_submit_async calls are expected to execute serially. HSA offers
+// various options to run the data copies concurrently. This may require changes
+// to libomptarget.
+
+// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
+// there are no outstanding kernels that need to be synchronized. Any async call
+// may be passed a Queue==0, at which point the cuda implementation will set it
+// to non-null (see getStream). The cuda streams are per-device. Upstream may
+// change this interface to explicitly initialize the async_info_pointer, but
+// until then hsa lazily initializes it as well.
+
+void initAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
+ // set non-null while using async calls, return to null to indicate completion
+ assert(async_info_ptr);
+ if (!async_info_ptr->Queue) {
+ async_info_ptr->Queue = reinterpret_cast<void *>(UINT64_MAX);
+ }
+}
+void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
+ assert(async_info_ptr);
+ assert(async_info_ptr->Queue);
+ async_info_ptr->Queue = 0;
+}
+} // namespace
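// [Editor's sketch, not part of this commit] The Queue sentinel lifecycle,
// assuming a caller that mirrors the cuda plugin:
//
//   __tgt_async_info info = {0};                         // Queue == nullptr
//   __tgt_rtl_data_submit_async(id, tgt, hst, n, &info); // Queue set non-null
//   __tgt_rtl_synchronize(id, &info);                    // Queue reset to 0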
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+ return elf_machine_id_is_amdgcn(image);
+}
+
+int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+ DP("Init requires flags to %ld\n", RequiresFlags);
+ DeviceInfo.RequiresFlags = RequiresFlags;
+ return RequiresFlags;
+}
+
+int32_t __tgt_rtl_init_device(int device_id) {
+ hsa_status_t err;
+
+ // this is per device id init
+ DP("Initialize the device id: %d\n", device_id);
+
+ hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
+
+ // Get number of Compute Unit
+ uint32_t compute_units = 0;
+ err = hsa_agent_get_info(
+ agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
+ &compute_units);
+ if (err != HSA_STATUS_SUCCESS) {
+ DeviceInfo.ComputeUnits[device_id] = 1;
+ DP("Error getting compute units : settiing to 1\n");
+ } else {
+ DeviceInfo.ComputeUnits[device_id] = compute_units;
+ DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]);
+ }
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id,
+ DeviceInfo.ComputeUnits[device_id]);
+
+ // Query attributes to determine number of threads/block and blocks/grid.
+ uint16_t workgroup_max_dim[3];
+ err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+ &workgroup_max_dim);
+ if (err != HSA_STATUS_SUCCESS) {
+ DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DP("Error getting grid dims: num groups : %d\n",
+ RTLDeviceInfoTy::DefaultNumTeams);
+ } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
+ DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
+ DP("Using %d ROCm blocks per grid\n",
+ DeviceInfo.GroupsPerDevice[device_id]);
+ } else {
+ DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+ DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n",
+ workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
+ }
+
+ // Get thread limit
+ hsa_dim3_t grid_max_dim;
+ err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
+ if (err == HSA_STATUS_SUCCESS) {
+ DeviceInfo.ThreadsPerGroup[device_id] =
+ reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
+ DeviceInfo.GroupsPerDevice[device_id];
+ if ((DeviceInfo.ThreadsPerGroup[device_id] >
+ RTLDeviceInfoTy::Max_WG_Size) ||
+ DeviceInfo.ThreadsPerGroup[device_id] == 0) {
+ DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
+ DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
+ } else {
+ DP("Using ROCm Queried thread limit: %d\n",
+ DeviceInfo.ThreadsPerGroup[device_id]);
+ }
+ } else {
+ DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
+ DP("Error getting max block dimension, use default:%d \n",
+ RTLDeviceInfoTy::Max_WG_Size);
+ }
+
+ // Get wavefront size
+ uint32_t wavefront_size = 0;
+ err =
+ hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
+ if (err == HSA_STATUS_SUCCESS) {
+ DP("Queried wavefront size: %d\n", wavefront_size);
+ DeviceInfo.WarpSize[device_id] = wavefront_size;
+ } else {
+ DP("Default wavefront size: %d\n",
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]);
+ DeviceInfo.WarpSize[device_id] =
+ llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
+ }
+
+ // Adjust teams to the env variables
+ if (DeviceInfo.EnvTeamLimit > 0 &&
+ DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) {
+ DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit;
+ DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
+ DeviceInfo.EnvTeamLimit);
+ }
+
+ // Set default number of teams
+ if (DeviceInfo.EnvNumTeams > 0) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+ DP("Default number of teams set according to environment %d\n",
+ DeviceInfo.EnvNumTeams);
+ } else {
+ DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DP("Default number of teams set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumTeams);
+ }
+
+ if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id];
+ DP("Default number of teams exceeds device limit, capping at %d\n",
+ DeviceInfo.GroupsPerDevice[device_id]);
+ }
+
+ // Set default number of threads
+ DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
+ DP("Default number of threads set according to library's default %d\n",
+ RTLDeviceInfoTy::Default_WG_Size);
+ if (DeviceInfo.NumThreads[device_id] >
+ DeviceInfo.ThreadsPerGroup[device_id]) {
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerGroup[device_id];
+ DP("Default number of threads exceeds device limit, capping at %d\n",
+ DeviceInfo.ThreadsPerGroup[device_id]);
+ }
+
+ DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
+ device_id, DeviceInfo.GroupsPerDevice[device_id],
+ DeviceInfo.ThreadsPerGroup[device_id]);
+
+ DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
+ DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
+ DeviceInfo.GroupsPerDevice[device_id],
+ DeviceInfo.GroupsPerDevice[device_id] *
+ DeviceInfo.ThreadsPerGroup[device_id]);
+
+ return OFFLOAD_SUCCESS;
+}
+
+namespace {
+Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
+ size_t N;
+ int rc = elf_getshdrnum(elf, &N);
+ if (rc != 0) {
+ return nullptr;
+ }
+
+ Elf64_Shdr *result = nullptr;
+ for (size_t i = 0; i < N; i++) {
+ Elf_Scn *scn = elf_getscn(elf, i);
+ if (scn) {
+ Elf64_Shdr *shdr = elf64_getshdr(scn);
+ if (shdr) {
+ if (shdr->sh_type == SHT_HASH) {
+ if (result == nullptr) {
+ result = shdr;
+ } else {
+ // multiple SHT_HASH sections not handled
+ return nullptr;
+ }
+ }
+ }
+ }
+ }
+ return result;
+}
+
+const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
+ const char *symname) {
+
+ assert(section_hash);
+ size_t section_symtab_index = section_hash->sh_link;
+ Elf64_Shdr *section_symtab =
+ elf64_getshdr(elf_getscn(elf, section_symtab_index));
+ size_t section_strtab_index = section_symtab->sh_link;
+
+ const Elf64_Sym *symtab =
+ reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);
+
+ const uint32_t *hashtab =
+ reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);
+
+ // Layout:
+ // nbucket
+ // nchain
+ // bucket[nbucket]
+ // chain[nchain]
+ uint32_t nbucket = hashtab[0];
+ const uint32_t *bucket = &hashtab[2];
+ const uint32_t *chain = &hashtab[nbucket + 2];
+
+ const size_t max = strlen(symname) + 1;
+ const uint32_t hash = elf_hash(symname);
+ for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
+ char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
+ if (strncmp(symname, n, max) == 0) {
+ return &symtab[i];
+ }
+ }
+
+ return nullptr;
+}
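// [Editor's note, not part of this commit] elf_lookup relies on the classic
// SysV ELF hash that libelf's elf_hash() computes, equivalent to:
//
//   unsigned long sysv_elf_hash(const unsigned char *name) {
//     unsigned long h = 0, g;
//     while (*name) {
//       h = (h << 4) + *name++;
//       if ((g = h & 0xf0000000))
//         h ^= g >> 24;
//       h &= ~g;
//     }
//     return h;
//   }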
+
+typedef struct {
+ void *addr = nullptr;
+ uint32_t size = UINT32_MAX;
+} symbol_info;
+
+int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
+ symbol_info *res) {
+ if (elf_kind(elf) != ELF_K_ELF) {
+ return 1;
+ }
+
+ Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
+ if (!section_hash) {
+ return 1;
+ }
+
+ const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
+ if (!sym) {
+ return 1;
+ }
+
+ if (sym->st_size > UINT32_MAX) {
+ return 1;
+ }
+
+ res->size = static_cast<uint32_t>(sym->st_size);
+ res->addr = sym->st_value + base;
+ return 0;
+}
+
+int get_symbol_info_without_loading(char *base, size_t img_size,
+ const char *symname, symbol_info *res) {
+ Elf *elf = elf_memory(base, img_size);
+ if (elf) {
+ int rc = get_symbol_info_without_loading(elf, base, symname, res);
+ elf_end(elf);
+ return rc;
+ }
+ return 1;
+}
+
+atmi_status_t interop_get_symbol_info(char *base, size_t img_size,
+ const char *symname, void **var_addr,
+ uint32_t *var_size) {
+ symbol_info si;
+ int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
+ if (rc == 0) {
+ *var_addr = si.addr;
+ *var_size = si.size;
+ return ATMI_STATUS_SUCCESS;
+ } else {
+ return ATMI_STATUS_ERROR;
+ }
+}
+
+template <typename C>
+atmi_status_t module_register_from_memory_to_place(void *module_bytes,
+ size_t module_size,
+ atmi_place_t place, C cb) {
+ auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t {
+ C *unwrapped = static_cast<C *>(cb_state);
+ return (*unwrapped)(data, size);
+ };
+ return atmi_module_register_from_memory_to_place(
+ module_bytes, module_size, place, L, static_cast<void *>(&cb));
+}
+} // namespace
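// [Editor's note, not part of this commit] This template is the standard
// trampoline for passing a capturing C++ lambda through a C callback: the
// captureless lambda L converts to a plain function pointer, and the real
// callback rides along in the void *cb_state parameter. A hypothetical use:
//
//   atmi_status_t rc = module_register_from_memory_to_place(
//       bytes, size, get_gpu_place(0),
//       [&](void *data, size_t n) -> atmi_status_t {
//         /* patch the deserialized image in place */
//         return ATMI_STATUS_SUCCESS;
//       });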
+
+static __tgt_target_table *
+__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+ __tgt_device_image *image) {
+ static pthread_mutex_t load_binary_mutex = PTHREAD_MUTEX_INITIALIZER;
+ pthread_mutex_lock(&load_binary_mutex);
+ __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
+ pthread_mutex_unlock(&load_binary_mutex);
+ return res;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
+ __tgt_device_image *image) {
+ const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
+
+ DeviceInfo.clearOffloadEntriesTable(device_id);
+
+ // We do not need to set the ELF version because the caller of this function
+ // had to do that to decide the right runtime to use
+
+ if (!elf_machine_id_is_amdgcn(image)) {
+ return NULL;
+ }
+
+ omptarget_device_environmentTy host_device_env;
+ host_device_env.num_devices = DeviceInfo.NumberOfDevices;
+ host_device_env.device_num = device_id;
+ host_device_env.debug_level = 0;
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+ host_device_env.debug_level = std::stoi(envStr);
+ }
+#endif
+
+ auto on_deserialized_data = [&](void *data, size_t size) -> atmi_status_t {
+ const char *device_env_Name = "omptarget_device_environment";
+ symbol_info si;
+ int rc = get_symbol_info_without_loading((char *)image->ImageStart,
+ img_size, device_env_Name, &si);
+ if (rc != 0) {
+ DP("Finding global device environment '%s' - symbol missing.\n",
+ device_env_Name);
+      // no need to return FAIL; assume this is not a device debug build.
+ return ATMI_STATUS_SUCCESS;
+ }
+ if (si.size != sizeof(host_device_env)) {
+ return ATMI_STATUS_ERROR;
+ }
+ DP("Setting global device environment %lu bytes\n", si.size);
+ uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
+ void *pos = (char *)data + offset;
+ memcpy(pos, &host_device_env, sizeof(host_device_env));
+ return ATMI_STATUS_SUCCESS;
+ };
+
+ atmi_status_t err;
+ {
+ err = module_register_from_memory_to_place(
+ (void *)image->ImageStart, img_size, get_gpu_place(device_id),
+ on_deserialized_data);
+
+ check("Module registering", err);
+ if (err != ATMI_STATUS_SUCCESS) {
+ char GPUName[64] = "--unknown gpu--";
+ hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
+ (void)hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
+ (void *)GPUName);
+ fprintf(stderr,
+ "Possible gpu arch mismatch: %s, please check"
+ " compiler: -march=<gpu> flag\n",
+ GPUName);
+ return NULL;
+ }
+ }
+
+ DP("ATMI module successfully loaded!\n");
+
+ // TODO: Check with Guansong to understand the below comment more thoroughly.
+ // Here, we take advantage of the data that is appended after img_end to get
+  // the symbol names we need to load. This data consists of the host entries
+ // begin and end as well as the target name (see the offloading linker script
+ // creation in clang compiler).
+
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name.
+
+ __tgt_offload_entry *HostBegin = image->EntriesBegin;
+ __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+ for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+ if (!e->addr) {
+ // The host should have always something in the address to
+ // uniquely identify the target region.
+ fprintf(stderr, "Analyzing host entry '<null>' (size = %lld)...\n",
+ (unsigned long long)e->size);
+ return NULL;
+ }
+
+ if (e->size) {
+ __tgt_offload_entry entry = *e;
+
+ void *varptr;
+ uint32_t varsize;
+
+ err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id),
+ e->name, &varptr, &varsize);
+
+ if (err != ATMI_STATUS_SUCCESS) {
+ DP("Loading global '%s' (Failed)\n", e->name);
+ // Inform the user what symbol prevented offloading
+ fprintf(stderr, "Loading global '%s' (Failed)\n", e->name);
+ return NULL;
+ }
+
+ if (varsize != e->size) {
+ DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
+ varsize, e->size);
+ return NULL;
+ }
+
+ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+ DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
+ entry.addr = (void *)varptr;
+
+ DeviceInfo.addOffloadEntry(device_id, entry);
+
+ if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ e->flags & OMP_DECLARE_TARGET_LINK) {
+ // If unified memory is present any target link variables
+ // can access host addresses directly. There is no longer a
+ // need for device copies.
+ err = atmi_memcpy(varptr, e->addr, sizeof(void *));
+ if (err != ATMI_STATUS_SUCCESS)
+ DP("Error when copying USM\n");
+ DP("Copy linked variable host address (" DPxMOD ")"
+ "to device address (" DPxMOD ")\n",
+ DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
+ }
+
+ continue;
+ }
+
+ DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name));
+
+ atmi_mem_place_t place = get_gpu_mem_place(device_id);
+ uint32_t kernarg_segment_size;
+ err = atmi_interop_hsa_get_kernel_info(
+ place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+ &kernarg_segment_size);
+
+ // each arg is a void * in this openmp implementation
+ uint32_t arg_num = kernarg_segment_size / sizeof(void *);
+ std::vector<size_t> arg_sizes(arg_num);
+ for (std::vector<size_t>::iterator it = arg_sizes.begin();
+ it != arg_sizes.end(); it++) {
+ *it = sizeof(void *);
+ }
+
+    // default value GENERIC (in case the symbol is missing from the code object)
+ int8_t ExecModeVal = ExecutionModeType::GENERIC;
+
+ // get flat group size if present, else Default_WG_Size
+ int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
+
+ // Max parallel level
+ int16_t MaxParLevVal = 0;
+
+ // get Kernel Descriptor if present.
+    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
+ struct KernDescValType {
+ uint16_t Version;
+ uint16_t TSize;
+ uint16_t WG_Size;
+ uint8_t Mode;
+ uint8_t HostServices;
+ uint8_t MaxParallelLevel;
+ };
+ struct KernDescValType KernDescVal;
+ std::string KernDescNameStr(e->name);
+ KernDescNameStr += "_kern_desc";
+ const char *KernDescName = KernDescNameStr.c_str();
+
+ void *KernDescPtr;
+ uint32_t KernDescSize;
+ void *CallStackAddr;
+ err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+ KernDescName, &KernDescPtr, &KernDescSize);
+
+ if (err == ATMI_STATUS_SUCCESS) {
+ if ((size_t)KernDescSize != sizeof(KernDescVal))
+ DP("Loading global computation properties '%s' - size mismatch (%u != "
+ "%lu)\n",
+ KernDescName, KernDescSize, sizeof(KernDescVal));
+
+ memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
+
+ // Check structure size against recorded size.
+ if ((size_t)KernDescSize != KernDescVal.TSize)
+ DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
+ sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
+
+ DP("After loading global for %s KernDesc \n", KernDescName);
+ DP("KernDesc: Version: %d\n", KernDescVal.Version);
+ DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
+ DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
+ DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
+ DP("KernDesc: HostServices: %x\n", KernDescVal.HostServices);
+ DP("KernDesc: MaxParallelLevel: %x\n", KernDescVal.MaxParallelLevel);
+
+ // gather location of callStack and size of struct
+ MaxParLevVal = KernDescVal.MaxParallelLevel;
+ if (MaxParLevVal > 0) {
+ uint32_t varsize;
+ const char *CsNam = "omptarget_nest_par_call_stack";
+ err = atmi_interop_hsa_get_symbol_info(place, CsNam, &CallStackAddr,
+ &varsize);
+ if (err != ATMI_STATUS_SUCCESS) {
+ fprintf(stderr, "Addr of %s failed\n", CsNam);
+ return NULL;
+ }
+ void *StructSizePtr;
+ const char *SsNam = "omptarget_nest_par_call_struct_size";
+ err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+ SsNam, &StructSizePtr, &varsize);
+ if ((err != ATMI_STATUS_SUCCESS) ||
+ (varsize != sizeof(TgtStackItemSize))) {
+ fprintf(stderr, "Addr of %s failed\n", SsNam);
+ return NULL;
+ }
+ memcpy(&TgtStackItemSize, StructSizePtr, sizeof(TgtStackItemSize));
+ DP("Size of our struct is %d\n", TgtStackItemSize);
+ }
+
+ // Get ExecMode
+ ExecModeVal = KernDescVal.Mode;
+ DP("ExecModeVal %d\n", ExecModeVal);
+ if (KernDescVal.WG_Size == 0) {
+ KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
+ DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
+ }
+ WGSizeVal = KernDescVal.WG_Size;
+ DP("WGSizeVal %d\n", WGSizeVal);
+ check("Loading KernDesc computation property", err);
+ } else {
+ DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
+
+ // Generic
+ std::string ExecModeNameStr(e->name);
+ ExecModeNameStr += "_exec_mode";
+ const char *ExecModeName = ExecModeNameStr.c_str();
+
+ void *ExecModePtr;
+ uint32_t varsize;
+ err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+ ExecModeName, &ExecModePtr, &varsize);
+
+ if (err == ATMI_STATUS_SUCCESS) {
+ if ((size_t)varsize != sizeof(int8_t)) {
+ DP("Loading global computation properties '%s' - size mismatch(%u != "
+ "%lu)\n",
+ ExecModeName, varsize, sizeof(int8_t));
+ return NULL;
+ }
+
+ memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
+
+ DP("After loading global for %s ExecMode = %d\n", ExecModeName,
+ ExecModeVal);
+
+ if (ExecModeVal < 0 || ExecModeVal > 1) {
+ DP("Error wrong exec_mode value specified in HSA code object file: "
+ "%d\n",
+ ExecModeVal);
+ return NULL;
+ }
+ } else {
+ DP("Loading global exec_mode '%s' - symbol missing, using default "
+ "value "
+ "GENERIC (1)\n",
+ ExecModeName);
+ }
+ check("Loading computation property", err);
+
+ // Flat group size
+ std::string WGSizeNameStr(e->name);
+ WGSizeNameStr += "_wg_size";
+ const char *WGSizeName = WGSizeNameStr.c_str();
+
+ void *WGSizePtr;
+ uint32_t WGSize;
+ err = interop_get_symbol_info((char *)image->ImageStart, img_size,
+ WGSizeName, &WGSizePtr, &WGSize);
+
+ if (err == ATMI_STATUS_SUCCESS) {
+ if ((size_t)WGSize != sizeof(int16_t)) {
+ DP("Loading global computation properties '%s' - size mismatch (%u "
+ "!= "
+ "%lu)\n",
+ WGSizeName, WGSize, sizeof(int16_t));
+ return NULL;
+ }
+
+ memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
+
+ DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
+
+ if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
+ WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
+ DP("Error wrong WGSize value specified in HSA code object file: "
+ "%d\n",
+ WGSizeVal);
+ WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
+ }
+ } else {
+ DP("Warning: Loading WGSize '%s' - symbol not found, "
+ "using default value %d\n",
+ WGSizeName, WGSizeVal);
+ }
+
+ check("Loading WGSize computation property", err);
+ }
+
+ KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, MaxParLevVal,
+ device_id, CallStackAddr, e->name,
+ kernarg_segment_size));
+ __tgt_offload_entry entry = *e;
+ entry.addr = (void *)&KernelsList.back();
+ DeviceInfo.addOffloadEntry(device_id, entry);
+ DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
+ }
+
+ return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) {
+ void *ptr = NULL;
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id));
+ DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
+ (long long unsigned)(Elf64_Addr)ptr);
+ ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL;
+ return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size) {
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ __tgt_async_info async_info;
+ int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info);
+ if (rc != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size,
+ __tgt_async_info *async_info_ptr) {
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ if (async_info_ptr) {
+ initAsyncInfoPtr(async_info_ptr);
+ return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
+ } else {
+ return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
+ }
+}
+
+int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
+ int64_t size) {
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ __tgt_async_info async_info;
+ int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info);
+ if (rc != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
+ void *tgt_ptr, int64_t size,
+ __tgt_async_info *async_info_ptr) {
+ assert(async_info_ptr && "async_info is nullptr");
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ initAsyncInfoPtr(async_info_ptr);
+ return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ atmi_status_t err;
+ DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
+ err = atmi_free(tgt_ptr);
+ if (err != ATMI_STATUS_SUCCESS) {
+ DP("Error when freeing CUDA memory\n");
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+// Determine launch values for threadsPerGroup and num_groups.
+// Outputs: threadsPerGroup, num_groups
+// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
+// EnvTeamLimit, EnvNumTeams, num_teams, thread_limit,
+// loop_tripcount.
+void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
+ int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
+ int num_teams, int thread_limit, uint64_t loop_tripcount) {
+
+ int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
+ ? DeviceInfo.EnvMaxTeamsDefault
+ : DeviceInfo.Max_Teams;
+ if (Max_Teams > DeviceInfo.HardTeamLimit)
+ Max_Teams = DeviceInfo.HardTeamLimit;
+
+ if (print_kernel_trace > 1) {
+ fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
+ RTLDeviceInfoTy::Max_Teams);
+ fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
+ fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
+ RTLDeviceInfoTy::Warp_Size);
+ fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
+ RTLDeviceInfoTy::Max_WG_Size);
+ fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
+ RTLDeviceInfoTy::Default_WG_Size);
+ fprintf(stderr, "thread_limit: %d\n", thread_limit);
+ fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+ fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
+ }
+ // check for thread_limit() clause
+ if (thread_limit > 0) {
+ threadsPerGroup = thread_limit;
+ DP("Setting threads per block to requested %d\n", thread_limit);
+ if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
+ threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
+ DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
+ }
+ if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
+ threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
+ DP("Setting threads per block to maximum %d\n", threadsPerGroup);
+ }
+ }
+ // check flat_max_work_group_size attr here
+ if (threadsPerGroup > ConstWGSize) {
+ threadsPerGroup = ConstWGSize;
+ DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
+ threadsPerGroup);
+ }
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+ DP("Preparing %d threads\n", threadsPerGroup);
+
+ // Set default num_groups (teams)
+ if (DeviceInfo.EnvTeamLimit > 0)
+ num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit)
+ ? Max_Teams
+ : DeviceInfo.EnvTeamLimit;
+ else
+ num_groups = Max_Teams;
+ DP("Set default num of groups %d\n", num_groups);
+
+ if (print_kernel_trace > 1) {
+ fprintf(stderr, "num_groups: %d\n", num_groups);
+ fprintf(stderr, "num_teams: %d\n", num_teams);
+ }
+
+ // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
+  // This reduction is typical for the default case (no thread_limit clause)
+  // or when the user requests an excessive num_teams.
+  // FIXME: We can't distinguish between a constant or variable thread limit,
+  // so we only handle constant thread_limits.
+ if (threadsPerGroup >
+ RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024
+ // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
+ // here?
+ num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
+
+ // check for num_teams() clause
+ if (num_teams > 0) {
+ num_groups = (num_teams < num_groups) ? num_teams : num_groups;
+ }
+ if (print_kernel_trace > 1) {
+ fprintf(stderr, "num_groups: %d\n", num_groups);
+ fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
+ fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
+ }
+
+ if (DeviceInfo.EnvNumTeams > 0) {
+ num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams
+ : num_groups;
+ DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
+ } else if (DeviceInfo.EnvTeamLimit > 0) {
+ num_groups = (DeviceInfo.EnvTeamLimit < num_groups)
+ ? DeviceInfo.EnvTeamLimit
+ : num_groups;
+ DP("Modifying teams based on EnvTeamLimit%d\n", DeviceInfo.EnvTeamLimit);
+ } else {
+ if (num_teams <= 0) {
+ if (loop_tripcount > 0) {
+ if (ExecutionMode == SPMD) {
+ // round up to the nearest integer
+ num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
+ } else {
+ num_groups = loop_tripcount;
+ }
+ DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+ "threads per block %d\n",
+ num_groups, loop_tripcount, threadsPerGroup);
+ }
+ } else {
+ num_groups = num_teams;
+ }
+ if (num_groups > Max_Teams) {
+ num_groups = Max_Teams;
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups,
+ Max_Teams);
+ }
+ if (num_groups > num_teams && num_teams > 0) {
+ num_groups = num_teams;
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
+ num_groups, num_teams);
+ }
+ }
+
+  // A num_teams clause is always honored; only EnvMaxTeamsDefault can cap it.
+ if (num_teams > 0) {
+ num_groups = num_teams;
+ // Cap num_groups to EnvMaxTeamsDefault if set.
+ if (DeviceInfo.EnvMaxTeamsDefault > 0 &&
+ num_groups > DeviceInfo.EnvMaxTeamsDefault)
+ num_groups = DeviceInfo.EnvMaxTeamsDefault;
+ }
+ if (print_kernel_trace > 1) {
+ fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+ fprintf(stderr, "num_groups: %d\n", num_groups);
+ fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount);
+ }
+ DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
+ threadsPerGroup);
+}
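// [Editor's note, not part of this commit] A worked example of the logic
// above, assuming no clauses, no env vars, and an SPMD kernel with
// loop_tripcount = 10000:
//   - threadsPerGroup stays at Default_WG_Size = 256 (no thread_limit, and
//     ConstWGSize = 256 does not reduce it further);
//   - num_groups starts at Max_Teams = 128, then the tripcount path sets
//     num_groups = (10000 - 1) / 256 + 1 = 40, under all remaining caps.
// Final launch: 40 groups of 256 work-items.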
+
+static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
+ int ThreadsPerGroup,
+ int device_id,
+ void *CallStackAddr, int SPMD) {
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "MaxParLevel %d SPMD %d NumGroups %d NumThrds %d\n",
+ MaxParLevel, SPMD, NumGroups, ThreadsPerGroup);
+ // Total memory needed is Teams * Threads * ParLevels
+ size_t NestedMemSize =
+ MaxParLevel * NumGroups * ThreadsPerGroup * TgtStackItemSize * 4;
+
+ if (print_kernel_trace > 1)
+ fprintf(stderr, "NestedMemSize %ld \n", NestedMemSize);
+ assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+ void *TgtPtr = NULL;
+ atmi_status_t err =
+ atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
+ err = atmi_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
+ if (print_kernel_trace > 2)
+ fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
+ (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);
+ if (err != ATMI_STATUS_SUCCESS) {
+ fprintf(stderr, "Mem not wrtten to target, err %d\n", err);
+ }
+ return TgtPtr; // we need to free this after kernel.
+}
+
+static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
+ uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
+ bool full = true;
+ while (full) {
+ full =
+ packet_id >= (queue->size + hsa_queue_load_read_index_acquire(queue));
+ }
+ return packet_id;
+}
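// [Editor's note, not part of this commit] The spin above enforces the HSA
// ring-buffer invariant write_index - read_index < queue->size. Because
// queue->size is a power of two, a packet_id maps to the slot
//   base_address[packet_id & (queue->size - 1)]
// as computed in the dispatch path below.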
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+ int32_t arg_num, int32_t num_teams,
+ int32_t thread_limit,
+ uint64_t loop_tripcount) {
+ // Set the context we are using
+ // update thread limit content in gpu memory if un-initialized or specified
+ // from host
+
+ DP("Run target team region thread_limit %d\n", thread_limit);
+
+ // All args are references.
+ std::vector<void *> args(arg_num);
+ std::vector<void *> ptrs(arg_num);
+
+ DP("Arg_num: %d\n", arg_num);
+ for (int32_t i = 0; i < arg_num; ++i) {
+ ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+ args[i] = &ptrs[i];
+ DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
+ }
+
+ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+ /*
+ * Set limit based on ThreadsPerGroup and GroupsPerDevice
+ */
+ int num_groups = 0;
+
+ int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
+
+ getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
+ KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
+ DeviceInfo.EnvNumTeams,
+ num_teams, // From run_region arg
+ thread_limit, // From run_region arg
+ loop_tripcount // From run_region arg
+ );
+
+ void *TgtCallStack = NULL;
+ if (KernelInfo->MaxParLevel > 0)
+ TgtCallStack = AllocateNestedParallelCallMemory(
+ KernelInfo->MaxParLevel, num_groups, threadsPerGroup,
+ KernelInfo->device_id, KernelInfo->CallStackAddr,
+ KernelInfo->ExecutionMode);
+
+ if (print_kernel_trace > 0)
+ // enum modes are SPMD, GENERIC, NONE 0,1,2
+ fprintf(stderr,
+ "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
+ "reqd:(%4dX%4d) n:%s\n",
+ device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
+ arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
+ KernelInfo->Name);
+
+ // Run on the device.
+ {
+ hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
+ uint64_t packet_id = acquire_available_packet_id(queue);
+
+ const uint32_t mask = queue->size - 1; // size is a power of 2
+ hsa_kernel_dispatch_packet_t *packet =
+ (hsa_kernel_dispatch_packet_t *)queue->base_address +
+ (packet_id & mask);
+
+ // packet->header is written last
+ packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ packet->workgroup_size_x = threadsPerGroup;
+ packet->workgroup_size_y = 1;
+ packet->workgroup_size_z = 1;
+ packet->reserved0 = 0;
+ packet->grid_size_x = num_groups * threadsPerGroup;
+ packet->grid_size_y = 1;
+ packet->grid_size_z = 1;
+ packet->private_segment_size = 0;
+ packet->group_segment_size = 0;
+ packet->kernel_object = 0;
+ packet->kernarg_address = 0; // use the block allocator
+ packet->reserved2 = 0; // atmi writes id_ here
+ packet->completion_signal = {0}; // may want a pool of signals
+
+ std::string kernel_name = std::string(KernelInfo->Name);
+ {
+ assert(KernelInfoTable[device_id].find(kernel_name) !=
+ KernelInfoTable[device_id].end());
+ auto it = KernelInfoTable[device_id][kernel_name];
+ packet->kernel_object = it.kernel_object;
+ packet->private_segment_size = it.private_segment_size;
+ packet->group_segment_size = it.group_segment_size;
+ assert(arg_num == (int)it.num_args);
+ }
+
+ KernelArgPool *ArgPool = nullptr;
+ {
+ auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
+ if (it != KernelArgPoolMap.end()) {
+ ArgPool = (it->second).get();
+ }
+ }
+ if (!ArgPool) {
+ fprintf(stderr, "Warning: No ArgPool for %s on device %d\n",
+ KernelInfo->Name, device_id);
+ }
+ {
+ void *kernarg = nullptr;
+ if (ArgPool) {
+ assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
+ kernarg = ArgPool->allocate(arg_num);
+ }
+ if (!kernarg) {
+ printf("Allocate kernarg failed\n");
+ exit(1);
+ }
+
+ // Copy explicit arguments
+ for (int i = 0; i < arg_num; i++) {
+ memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
+ }
+
+ // Initialize implicit arguments. ATMI seems to leave most fields
+ // uninitialized
+ atmi_implicit_args_t *impl_args =
+ reinterpret_cast<atmi_implicit_args_t *>(
+ static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
+ memset(impl_args, 0,
+ sizeof(atmi_implicit_args_t)); // may not be necessary
+ impl_args->offset_x = 0;
+ impl_args->offset_y = 0;
+ impl_args->offset_z = 0;
+
+ packet->kernarg_address = kernarg;
+ }
+
+ {
+ hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
+ if (s.handle == 0) {
+ printf("Failed to get signal instance\n");
+ exit(1);
+ }
+ packet->completion_signal = s;
+ hsa_signal_store_relaxed(packet->completion_signal, 1);
+ }
+
+ core::packet_store_release(
+ reinterpret_cast<uint32_t *>(packet),
+ core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH, 0,
+ ATMI_FENCE_SCOPE_SYSTEM, ATMI_FENCE_SCOPE_SYSTEM),
+ packet->setup);
+
+ hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+
+ while (hsa_signal_wait_acquire(packet->completion_signal,
+ HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+ HSA_WAIT_STATE_BLOCKED) != 0)
+ ;
+
+ assert(ArgPool);
+ ArgPool->deallocate(packet->kernarg_address);
+ DeviceInfo.FreeSignalPool.push(packet->completion_signal);
+ }
+
+ DP("Kernel completed\n");
+ // Free call stack for nested
+ if (TgtCallStack)
+ atmi_free(TgtCallStack);
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+ int32_t arg_num) {
+ // use one team and one thread
+ // fix thread num
+ int32_t team_num = 1;
+ int32_t thread_limit = 0; // use default
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, team_num,
+ thread_limit, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+ void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+ int32_t arg_num,
+ __tgt_async_info *async_info_ptr) {
+ assert(async_info_ptr && "async_info is nullptr");
+ initAsyncInfoPtr(async_info_ptr);
+
+ // use one team and one thread
+ // fix thread num
+ int32_t team_num = 1;
+ int32_t thread_limit = 0; // use default
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, team_num,
+ thread_limit, 0);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+ __tgt_async_info *async_info_ptr) {
+ assert(async_info_ptr && "async_info is nullptr");
+
+ // Cuda asserts that async_info_ptr->Queue is non-null, but this invariant
+ // is not ensured by devices.cpp for amdgcn
+ // assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
+ if (async_info_ptr->Queue) {
+ finiAsyncInfoPtr(async_info_ptr);
+ }
+ return OFFLOAD_SUCCESS;
+}
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 61f8cc15fee7..8e101b7a7a5b 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -27,7 +27,9 @@ static const char *RTLNames[] = {
/* PowerPC target */ "libomptarget.rtl.ppc64.so",
/* x86_64 target */ "libomptarget.rtl.x86_64.so",
/* CUDA target */ "libomptarget.rtl.cuda.so",
- /* AArch64 target */ "libomptarget.rtl.aarch64.so"};
+ /* AArch64 target */ "libomptarget.rtl.aarch64.so",
+ /* AMDGPU target */ "libomptarget.rtl.amdgpu.so",
+};
RTLsTy *RTLs;
std::mutex *RTLsMtx;