[Openmp-commits] [openmp] e90ab91 - [OpenMP] Delete old plugins
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Wed Jul 5 15:39:58 PDT 2023
Author: Joseph Huber
Date: 2023-07-05T17:39:47-05:00
New Revision: e90ab9148baccda79874267c257bbba8d56e0600
URL: https://github.com/llvm/llvm-project/commit/e90ab9148baccda79874267c257bbba8d56e0600
DIFF: https://github.com/llvm/llvm-project/commit/e90ab9148baccda79874267c257bbba8d56e0600.diff
LOG: [OpenMP] Delete old plugins
It's time to remove the old plugins, as the next-gen plugins have already
been made the default in LLVM 16.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D142820
Added:
openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp
openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h
openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp
openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h
openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Modified:
openmp/docs/design/Runtimes.rst
openmp/libomptarget/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
openmp/libomptarget/src/rtl.cpp
Removed:
openmp/libomptarget/plugins/CMakeLists.txt
openmp/libomptarget/plugins/aarch64/CMakeLists.txt
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
openmp/libomptarget/plugins/amdgpu/impl/data.cpp
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h
openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
openmp/libomptarget/plugins/amdgpu/impl/internal.h
openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
openmp/libomptarget/plugins/amdgpu/impl/rt.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/src/print_tracing.h
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins/common/CMakeLists.txt
openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp
openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h
openmp/libomptarget/plugins/common/elf_common/elf_common.cpp
openmp/libomptarget/plugins/common/elf_common/elf_common.h
openmp/libomptarget/plugins/cuda/CMakeLists.txt
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
openmp/libomptarget/plugins/cuda/src/rtl.cpp
openmp/libomptarget/plugins/exports
openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
openmp/libomptarget/plugins/ppc64/CMakeLists.txt
openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
openmp/libomptarget/plugins/x86_64/CMakeLists.txt
################################################################################
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 978ef88dffb56c..c26ced06113fcc 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1378,7 +1378,7 @@ clause. Examples for both are given below.
.. code-block:: console
$ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c
- $ env LIBOMPTARGET_NEXTGEN_PLUGINS=1 ./shared
+ $ ./shared
.. _libomptarget_device_debugging:
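
For context, the documentation change above drops the
LIBOMPTARGET_NEXTGEN_PLUGINS environment variable entirely: the next-gen
plugins are now the only implementation, so nothing needs to be selected at
run time. As a hypothetical stand-in for the shared.c example (not the actual
test file), a minimal offloading program built with
clang++ -fopenmp --offload-arch=gfx90a -O3 and run directly would be:

#include <cstdio>

int main() {
  int X = 0;
// Offload a trivial region; map X back so the host observes the update.
#pragma omp target map(tofrom : X)
  { X = 42; }
  printf("X = %d\n", X); // prints 42, whether offloaded or via host fallback
  return 0;
}
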
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 5564e67437cd35..61bcb46504d43b 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -105,7 +105,6 @@ set(LIBOMPTARGET_LLVM_LIBRARY_DIR "${LLVM_LIBRARY_DIR}" CACHE STRING
"Path to folder containing llvm library libomptarget.so")
# Build offloading plugins and device RTLs if they are available.
-add_subdirectory(plugins)
add_subdirectory(plugins-nextgen)
add_subdirectory(DeviceRTL)
add_subdirectory(tools)
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index a51da395680ba6..5b131efa3832e2 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -12,12 +12,12 @@
add_subdirectory(common)
-# void build_generic_elf64_nextgen(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
# - build a plugin for an ELF based generic 64-bit target based on libffi.
# - tmachine: name of the machine processor as used in the cmake build system.
# - tmachine_name: name of the machine to be printed with the debug messages.
# - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64_nextgen tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
@@ -36,7 +36,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
# Define target triple
add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}")
- add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen"
+ add_llvm_library("omptarget.rtl.${tmachine_libname}"
SHARED
${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
@@ -58,23 +58,23 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
)
if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
- target_link_libraries("omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE
+ target_link_libraries("omptarget.rtl.${tmachine_libname}" PRIVATE
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
endif()
# Install plugin under the lib destination folder.
- install(TARGETS "omptarget.rtl.${tmachine_libname}.nextgen"
+ install(TARGETS "omptarget.rtl.${tmachine_libname}"
LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
- set_target_properties("omptarget.rtl.${tmachine_libname}.nextgen" PROPERTIES
+ set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
CXX_VISIBILITY_PRESET protected)
- target_include_directories( "omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE
+ target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE
${LIBOMPTARGET_INCLUDE_DIR}
${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
list(APPEND LIBOMPTARGET_TESTED_PLUGINS
- "omptarget.rtl.${tmachine_libname}.nextgen")
+ "omptarget.rtl.${tmachine_libname}")
else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: libffi dependency not found.")
diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
index 35eafe3b5cc915..2c2b753590e205 100644
--- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
@@ -11,7 +11,7 @@
##===----------------------------------------------------------------------===##
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+ build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
else()
libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
endif()
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
index d0d5f54c667da1..83f13ed3ccff22 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -52,8 +52,8 @@ if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
else()
libomptarget_say("Building AMDGPU NextGen plugin for dlopened libhsa")
- include_directories(../../plugins/amdgpu/dynamic_hsa)
- set(LIBOMPTARGET_EXTRA_SOURCE ../../plugins/amdgpu/dynamic_hsa/hsa.cpp)
+ include_directories(dynamic_hsa)
+ set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp)
set(LIBOMPTARGET_DEP_LIBRARIES)
endif()
@@ -66,7 +66,7 @@ else()
set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
endif()
-add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED
+add_llvm_library(omptarget.rtl.amdgpu SHARED
src/rtl.cpp
${LIBOMPTARGET_EXTRA_SOURCE}
@@ -91,16 +91,16 @@ add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED
)
if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
- target_link_libraries(omptarget.rtl.amdgpu.nextgen PRIVATE OMPT)
+ target_link_libraries(omptarget.rtl.amdgpu PRIVATE OMPT)
endif()
if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
- target_link_libraries(omptarget.rtl.amdgpu.nextgen PRIVATE
+ target_link_libraries(omptarget.rtl.amdgpu PRIVATE
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
endif()
target_include_directories(
- omptarget.rtl.amdgpu.nextgen
+ omptarget.rtl.amdgpu
PRIVATE
${LIBOMPTARGET_INCLUDE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/utils
@@ -108,7 +108,7 @@ target_include_directories(
# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu.nextgen PROPERTIES
+install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.amdgpu PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
CXX_VISIBILITY_PRESET protected)
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
similarity index 100%
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
similarity index 100%
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
similarity index 100%
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
index f0645d0d175381..368f70984b0bde 100644
--- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
@@ -12,3 +12,5 @@
add_subdirectory(OMPT)
add_subdirectory(PluginInterface)
+add_subdirectory(MemoryManager)
+add_subdirectory(elf_common)
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt
similarity index 100%
rename from openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
similarity index 100%
rename from openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
similarity index 94%
rename from openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
index 54d28bc66d634b..4d3a4019f1dd5d 100644
--- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
@@ -11,7 +11,7 @@
##===----------------------------------------------------------------------===##
# NOTE: Don't try to build `elf_common` using `add_llvm_library`.
-# See openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
+# See openmp/libomptarget/plugins/common/PluginInterface/CMakeLists.txt
# for more explanation.
add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp)
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp
similarity index 100%
rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h
similarity index 100%
rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp
similarity index 100%
rename from openmp/libomptarget/plugins/common/elf_common/elf_common.cpp
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h
similarity index 100%
rename from openmp/libomptarget/plugins/common/elf_common/elf_common.h
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
index 736a961f56181e..d5b9b8955ebae7 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
@@ -26,7 +26,7 @@ libomptarget_say("Building CUDA NextGen offloading plugin.")
set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
-add_llvm_library(omptarget.rtl.cuda.nextgen SHARED
+add_llvm_library(omptarget.rtl.cuda SHARED
src/rtl.cpp
LINK_COMPONENTS
@@ -43,33 +43,33 @@ add_llvm_library(omptarget.rtl.cuda.nextgen SHARED
)
if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
- target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE OMPT)
+ target_link_libraries(omptarget.rtl.cuda PRIVATE OMPT)
endif()
if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
- target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE
+ target_link_libraries(omptarget.rtl.cuda PRIVATE
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs")
endif()
if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Building CUDA plugin linked against libcuda")
- target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE CUDA::cuda_driver)
+ target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
else()
libomptarget_say("Building CUDA plugin for dlopened libcuda")
- target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda)
- target_sources(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda/cuda.cpp)
+ target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda)
+ target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp)
endif()
# Define debug prefix. TODO: This should be automatized in the Debug.h but it
# requires changing the original plugins.
-target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE TARGET_NAME="CUDA")
-target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE DEBUG_PREFIX="TARGET CUDA RTL")
+target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA")
+target_compile_definitions(omptarget.rtl.cuda PRIVATE DEBUG_PREFIX="TARGET CUDA RTL")
-target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
+target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.cuda PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
CXX_VISIBILITY_PRESET protected)
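
The "dlopened libcuda" branch above is worth a note: when the CUDA driver is
not available at build time, the plugin compiles dynamic_cuda/cuda.cpp, which
resolves the driver entry points at load time instead of linking against
CUDA::cuda_driver. A minimal sketch of that technique (not the plugin's
actual wrapper; the CUresult typedef is simplified here) looks like:

#include <cstdio>
#include <dlfcn.h>

typedef int CUresult;                       // stand-in for the driver typedef
typedef CUresult (*cuInit_t)(unsigned int); // matches the cuInit signature

int main() {
  void *Lib = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
  if (!Lib) {
    fprintf(stderr, "libcuda not found: %s\n", dlerror());
    return 1;
  }
  cuInit_t CuInit = (cuInit_t)dlsym(Lib, "cuInit");
  if (!CuInit) {
    fprintf(stderr, "cuInit not found\n");
    return 1;
  }
  CUresult RC = CuInit(/*Flags=*/0);
  printf("cuInit returned %d\n", RC); // 0 == CUDA_SUCCESS
  dlclose(Lib);
  return 0;
}

The AMDGPU plugin's dynamic_hsa/hsa.cpp follows the same pattern for libhsa.
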
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
similarity index 100%
rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
similarity index 100%
rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
index 8160f1d862245f..0cccc9cb82e42c 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
@@ -11,7 +11,7 @@
##===----------------------------------------------------------------------===##
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+ build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
else()
libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
endif()
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
index af13a56cb0d9bd..9461d79d145dfe 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
@@ -11,7 +11,7 @@
##===----------------------------------------------------------------------===##
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+ build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
else()
libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
endif()
diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
index 214444d5366d42..129d526a2ae784 100644
--- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
@@ -11,7 +11,7 @@
##===----------------------------------------------------------------------===##
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+ build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
else()
libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
endif()
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
deleted file mode 100644
index 0f3f372caede9f..00000000000000
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build plugins for the user system if available.
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(common)
-
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
-# - build a plugin for an ELF based generic 64-bit target based on libffi.
-# - tmachine: name of the machine processor as used in the cmake build system.
-# - tmachine_name: name of the machine to be printed with the debug messages.
-# - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
- if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-
- libomptarget_say("Building ${tmachine_name} offloading plugin.")
-
- # Define macro to be used as prefix of the runtime messages for this target.
- add_definitions("-DTARGET_NAME=${tmachine_name}")
-
- # Define macro with the ELF ID for this target.
- add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
-
- add_llvm_library("omptarget.rtl.${tmachine_libname}"
- SHARED
-
- ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${LIBOMPTARGET_INCLUDE_DIR}
- ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}
-
- LINK_LIBS
- PRIVATE
- elf_common
- ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
- ${OPENMP_PTHREAD_LIB}
- "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
-
- NO_INSTALL_RPATH
- )
-
- # Install plugin under the lib destination folder.
- install(TARGETS "omptarget.rtl.${tmachine_libname}"
- LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
- set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
- INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
- CXX_VISIBILITY_PRESET protected)
-
- target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE
- ${LIBOMPTARGET_INCLUDE_DIR}
- ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
-
- list(APPEND LIBOMPTARGET_TESTED_PLUGINS
- "omptarget.rtl.${tmachine_libname}")
-
- # Report to the parent scope that we are building a plugin.
- set(LIBOMPTARGET_SYSTEM_TARGETS
- "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-LTO" PARENT_SCOPE)
- set(LIBOMPTARGET_TESTED_PLUGINS
- "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
- else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
- libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
- endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-else()
- libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
-endif()
-endmacro()
-
-add_subdirectory(aarch64)
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(ppc64)
-add_subdirectory(ppc64le)
-add_subdirectory(x86_64)
-
-# Make sure the parent scope can see the plugins that will be created.
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
deleted file mode 100644
index 350a56cb9a493b..00000000000000
--- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an aarch64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
-else()
- libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
deleted file mode 100644
index e8abfb8247936b..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is dual licensed under the MIT and the University of Illinois Open
-# Source Licenses. See LICENSE.txt for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an AMDGPU machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-################################################################################
-set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL
- "Whether to build AMDGPU plugin")
-if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN)
- libomptarget_say("Not building AMDGPU offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false")
- return()
-endif()
-
-# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
-find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-
-if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
- return()
-endif()
-
-################################################################################
-# Define the suffix for the runtime messaging dumps.
-add_definitions(-DTARGET_NAME=AMDGPU)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
- add_definitions(-DLITTLEENDIAN_CPU=1)
-endif()
-
-if(CMAKE_BUILD_TYPE MATCHES Debug)
- add_definitions(-DDEBUG)
-endif()
-
-set(LIBOMPTARGET_DLOPEN_LIBHSA OFF)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA})
-
-if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
- libomptarget_say("Building AMDGPU plugin linked against libhsa")
- set(LIBOMPTARGET_EXTRA_SOURCE)
- set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
-else()
- libomptarget_say("Building AMDGPU plugin for dlopened libhsa")
- include_directories(dynamic_hsa)
- set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp)
- set(LIBOMPTARGET_DEP_LIBRARIES)
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
- # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
- # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
- # when creating a shared library.
- set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined")
-else()
- set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
-endif()
-
-add_llvm_library(omptarget.rtl.amdgpu SHARED
- impl/impl.cpp
- impl/interop_hsa.cpp
- impl/data.cpp
- impl/get_elf_mach_gfx_name.cpp
- impl/system.cpp
- impl/msgpack.cpp
- src/rtl.cpp
- ${LIBOMPTARGET_EXTRA_SOURCE}
-
- ADDITIONAL_HEADER_DIRS
- ${LIBOMPTARGET_INCLUDE_DIR}
- ${CMAKE_CURRENT_SOURCE_DIR}/impl
- ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-
- LINK_COMPONENTS
- Support
- Object
-
- LINK_LIBS
- PRIVATE
- elf_common
- ${LIBOMPTARGET_DEP_LIBRARIES}
- ${OPENMP_PTHREAD_LIB}
- "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
- ${LDFLAGS_UNDEFINED}
-
- NO_INSTALL_RPATH
-)
-
-target_include_directories(
- omptarget.rtl.amdgpu
- PRIVATE
- ${LIBOMPTARGET_INCLUDE_DIR}
- ${CMAKE_CURRENT_SOURCE_DIR}/impl
- ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-)
-
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu PROPERTIES
- INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
- CXX_VISIBILITY_PRESET protected)
-
-# Report to the parent scope that we are building a plugin for hsa.
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libhsa is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no hsa available
-option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS)
- # Report to the parent scope that we are building a plugin for amdgpu
- set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
- list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu")
- set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
- libomptarget_say("Not generating AMDGPU tests, no supported devices detected. Use 'LIBOMPTARGET_FORCE_AMDGPU_TESTS' to override.")
- return()
-endif()
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
deleted file mode 100644
index 67942a8942f34d..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===--- amdgpu/impl/data.cpp ------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-#include "rt.h"
-#include <cassert>
-#include <stdio.h>
-#include <string.h>
-#include <vector>
-
-using core::TaskImpl;
-
-namespace core {
-namespace Runtime {
-hsa_status_t HostMalloc(void **ptr, size_t size,
- hsa_amd_memory_pool_t MemoryPool) {
- hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr);
- DP("Malloced %p\n", *ptr);
- if (err == HSA_STATUS_SUCCESS) {
- err = core::allow_access_to_all_gpu_agents(*ptr);
- }
- return err;
-}
-
-hsa_status_t Memfree(void *ptr) {
- hsa_status_t err = hsa_amd_memory_pool_free(ptr);
- DP("Freed %p\n", ptr);
- return err;
-}
-} // namespace Runtime
-} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
deleted file mode 100644
index 69f2a716a8fd68..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.cpp -------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "get_elf_mach_gfx_name.h"
-
-// This header conflicts with the system elf.h (macros vs enums of the same
-// identifier) and contains more up to date values for the enum checked here.
-// rtl.cpp uses the system elf.h.
-#include "llvm/BinaryFormat/ELF.h"
-
-const char *get_elf_mach_gfx_name(uint32_t EFlags) {
- using namespace llvm::ELF;
- uint32_t Gfx = (EFlags & EF_AMDGPU_MACH);
- switch (Gfx) {
- case EF_AMDGPU_MACH_AMDGCN_GFX801:
- return "gfx801";
- case EF_AMDGPU_MACH_AMDGCN_GFX802:
- return "gfx802";
- case EF_AMDGPU_MACH_AMDGCN_GFX803:
- return "gfx803";
- case EF_AMDGPU_MACH_AMDGCN_GFX805:
- return "gfx805";
- case EF_AMDGPU_MACH_AMDGCN_GFX810:
- return "gfx810";
- case EF_AMDGPU_MACH_AMDGCN_GFX900:
- return "gfx900";
- case EF_AMDGPU_MACH_AMDGCN_GFX902:
- return "gfx902";
- case EF_AMDGPU_MACH_AMDGCN_GFX904:
- return "gfx904";
- case EF_AMDGPU_MACH_AMDGCN_GFX906:
- return "gfx906";
- case EF_AMDGPU_MACH_AMDGCN_GFX908:
- return "gfx908";
- case EF_AMDGPU_MACH_AMDGCN_GFX909:
- return "gfx909";
- case EF_AMDGPU_MACH_AMDGCN_GFX90A:
- return "gfx90a";
- case EF_AMDGPU_MACH_AMDGCN_GFX90C:
- return "gfx90c";
- case EF_AMDGPU_MACH_AMDGCN_GFX940:
- return "gfx940";
- case EF_AMDGPU_MACH_AMDGCN_GFX1010:
- return "gfx1010";
- case EF_AMDGPU_MACH_AMDGCN_GFX1011:
- return "gfx1011";
- case EF_AMDGPU_MACH_AMDGCN_GFX1012:
- return "gfx1012";
- case EF_AMDGPU_MACH_AMDGCN_GFX1013:
- return "gfx1013";
- case EF_AMDGPU_MACH_AMDGCN_GFX1030:
- return "gfx1030";
- case EF_AMDGPU_MACH_AMDGCN_GFX1031:
- return "gfx1031";
- case EF_AMDGPU_MACH_AMDGCN_GFX1032:
- return "gfx1032";
- case EF_AMDGPU_MACH_AMDGCN_GFX1033:
- return "gfx1033";
- case EF_AMDGPU_MACH_AMDGCN_GFX1034:
- return "gfx1034";
- case EF_AMDGPU_MACH_AMDGCN_GFX1035:
- return "gfx1035";
- case EF_AMDGPU_MACH_AMDGCN_GFX1036:
- return "gfx1036";
- case EF_AMDGPU_MACH_AMDGCN_GFX1100:
- return "gfx1100";
- case EF_AMDGPU_MACH_AMDGCN_GFX1101:
- return "gfx1101";
- case EF_AMDGPU_MACH_AMDGCN_GFX1102:
- return "gfx1102";
- case EF_AMDGPU_MACH_AMDGCN_GFX1103:
- return "gfx1103";
- default:
- return "--unknown gfx";
- }
-}
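
The helper deleted above maps the EF_AMDGPU_MACH field of an AMDGPU ELF
header's e_flags to a human-readable gfx name. A hedged usage sketch,
assuming the function is linked in and reusing the same
llvm/BinaryFormat/ELF.h constants the file itself used:

#include <cstdint>
#include <cstdio>
#include "llvm/BinaryFormat/ELF.h"

// Declaration from the deleted get_elf_mach_gfx_name.h.
const char *get_elf_mach_gfx_name(uint32_t EFlags);

int main() {
  // Only the EF_AMDGPU_MACH bits are inspected; other flag bits are masked.
  uint32_t EFlags = llvm::ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
  printf("%s\n", get_elf_mach_gfx_name(EFlags)); // prints "gfx90a"
  return 0;
}
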
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
deleted file mode 100644
index 177963e1b8b5c3..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.h ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED
-#define GET_ELF_MACH_GFX_NAME_H_INCLUDED
-
-#include <stdint.h>
-
-const char *get_elf_mach_gfx_name(uint32_t EFlags);
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h b/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h
deleted file mode 100644
index 241d589533d08e..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===--- amdgpu/impl/hsa_api.h ------------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_HSA_API_H_INCLUDED
-#define AMDGPU_HSA_API_H_INCLUDED
-
-#if defined(__has_include)
-#if __has_include("hsa/hsa.h")
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#elif __has_include("hsa.h")
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-#endif
-#else
-#include "hsa/hsa.h"
-#include "hsa_ext_amd.h"
-#endif
-
-
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
deleted file mode 100644
index 5299e3d238149c..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "rt.h"
-#include <memory>
-
-/*
- * Data
- */
-
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
- hsa_status_t err = HSA_STATUS_SUCCESS;
- hsa_amd_pointer_info_t info;
- info.size = sizeof(hsa_amd_pointer_info_t);
- err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
- /*num_agents_accessible=*/nullptr,
- /*accessible=*/nullptr);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error when getting pointer info\n");
- return err;
- }
-
- if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
- // When user passes in a basePtr+offset we need to fix the
- // locked pointer to include the offset: ROCr always returns
- // the base locked address, not the shifted one.
- if ((char *)info.hostBaseAddress <= (char *)ptr &&
- (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
- *agentBaseAddress =
- (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
- (uint64_t)info.hostBaseAddress);
- else // address is already device-agent accessible, no need to compute
- // offset
- *agentBaseAddress = ptr;
- } else
- *agentBaseAddress = nullptr;
-
- return HSA_STATUS_SUCCESS;
-}
-
-// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
-static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
- hsa_agent_t agent, const void *src,
- size_t size) {
- const hsa_signal_value_t init = 1;
- const hsa_signal_value_t success = 0;
- hsa_signal_store_screlease(signal, init);
-
- hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
- nullptr, signal);
- if (err != HSA_STATUS_SUCCESS)
- return err;
-
- // async_copy reports success by decrementing and failure by setting to < 0
- hsa_signal_value_t got = init;
- while (got == init)
- got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
- UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-
- if (got != success)
- return HSA_STATUS_ERROR;
-
- return err;
-}
-
-struct implFreePtrDeletor {
- void operator()(void *p) {
- core::Runtime::Memfree(p); // ignore failure to free
- }
-};
-
-enum CopyDirection { H2D, D2H };
-
-static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
- hsa_signal_t signal, void *dest,
- hsa_agent_t agent, void *src,
- void *lockingPtr, size_t size) {
- void *lockedPtr = nullptr;
- hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
- bool HostPtrIsLocked = true;
- if (err != HSA_STATUS_SUCCESS)
- return err;
- if (!lockedPtr) { // not locked
- HostPtrIsLocked = false;
- hsa_agent_t agents[1] = {agent};
- err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
- (void **)&lockedPtr);
- if (err != HSA_STATUS_SUCCESS)
- return err;
- DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
- lockingPtr, lockedPtr, size);
- }
-
- switch (direction) {
- case H2D:
- err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
- break;
- case D2H:
- err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
- break;
- }
-
- if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
- // do not leak locked host pointers, but discard potential error message
- // because the initial error was in the copy function
- hsa_amd_memory_unlock(lockingPtr);
- return err;
- }
-
- // unlock only if not user locked
- if (!HostPtrIsLocked)
- err = hsa_amd_memory_unlock(lockingPtr);
- if (err != HSA_STATUS_SUCCESS)
- return err;
-
- return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
- void *hostSrc, size_t size,
- hsa_agent_t device_agent,
- hsa_amd_memory_pool_t MemoryPool) {
- hsa_status_t err;
-
- err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
- device_agent, hostSrc, hostSrc, size);
-
- if (err == HSA_STATUS_SUCCESS)
- return err;
-
- // async memcpy sometimes fails in situations where
- // allocate + copy succeeds. Looks like it might be related to
- // locking part of a read only segment. Fall back for now.
- void *tempHostPtr;
- hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
- if (ret != HSA_STATUS_SUCCESS) {
- DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
- return ret;
- }
- std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
- memcpy(tempHostPtr, hostSrc, size);
-
- return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
- device_agent, tempHostPtr, tempHostPtr, size);
-}
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
- void *deviceSrc, size_t size,
- hsa_agent_t deviceAgent,
- hsa_amd_memory_pool_t MemoryPool) {
- hsa_status_t err;
-
- // device has always visibility over both pointers, so use that
- err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
- deviceSrc, hostDest, size);
-
- if (err == HSA_STATUS_SUCCESS)
- return err;
-
- // hsa_memory_copy sometimes fails in situations where
- // allocate + copy succeeds. Looks like it might be related to
- // locking part of a read only segment. Fall back for now.
- void *tempHostPtr;
- hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
- if (ret != HSA_STATUS_SUCCESS) {
- DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
- return ret;
- }
- std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
-
- err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
- deviceAgent, deviceSrc, tempHostPtr, size);
- if (err != HSA_STATUS_SUCCESS)
- return HSA_STATUS_ERROR;
-
- memcpy(hostDest, tempHostPtr, size);
- return HSA_STATUS_SUCCESS;
-}
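
One detail of the deleted is_locked() is worth spelling out: ROCr's
hsa_amd_pointer_info() always reports the base of a locked range, so a user
pointer at hostBase + offset has to be translated to agentBase + the same
offset before the copy. A small arithmetic sketch with hypothetical
addresses:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t HostBase  = 0x1000; // info.hostBaseAddress
  uint64_t AgentBase = 0x9000; // info.agentBaseAddress
  uint64_t Ptr       = 0x1040; // user pointer = HostBase + 0x40
  // The fix-up applied in is_locked() when Ptr falls inside the locked range:
  uint64_t AgentPtr = AgentBase + (Ptr - HostBase);
  assert(AgentPtr == 0x9040); // device-visible address keeps the 0x40 offset
  return 0;
}
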
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
deleted file mode 100644
index 671044089e1075..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/impl_runtime.h ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_IMPL_RUNTIME_H_
-#define INCLUDE_IMPL_RUNTIME_H_
-
-#include "hsa_api.h"
-
-extern "C" {
-
-// Check if pointer ptr is already locked
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress);
-
-hsa_status_t impl_module_register_from_memory_to_place(
- void *module_bytes, size_t module_size, int DeviceId,
- hsa_status_t (*on_deserialized_data)(void *data, size_t size,
- void *cb_state),
- void *cb_state);
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
- void *hostSrc, size_t size,
- hsa_agent_t device_agent,
- hsa_amd_memory_pool_t MemoryPool);
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
- size_t size, hsa_agent_t device_agent,
- hsa_amd_memory_pool_t MemoryPool);
-}
-
-#endif // INCLUDE_IMPL_RUNTIME_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
deleted file mode 100644
index dc94b0ed01f20a..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ /dev/null
@@ -1,154 +0,0 @@
-//===--- amdgpu/impl/internal.h ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#define SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#include <inttypes.h>
-#include <pthread.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <cstring>
-#include <map>
-#include <queue>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "hsa_api.h"
-
-#include "impl_runtime.h"
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#include "Debug.h"
-
-#define MAX_NUM_KERNELS (1024 * 16)
-
-// ---------------------- Kernel Start -------------
-typedef struct atl_kernel_info_s {
- uint64_t kernel_object;
- uint32_t group_segment_size;
- uint32_t private_segment_size;
- uint32_t sgpr_count;
- uint32_t vgpr_count;
- uint32_t sgpr_spill_count;
- uint32_t vgpr_spill_count;
- uint32_t kernel_segment_size;
- uint32_t explicit_argument_count;
- uint32_t implicit_argument_count;
-} atl_kernel_info_t;
-
-typedef struct atl_symbol_info_s {
- uint64_t addr;
- uint32_t size;
-} atl_symbol_info_t;
-
-// ---------------------- Kernel End -------------
-
-namespace core {
-class TaskgroupImpl;
-class TaskImpl;
-class Kernel;
-class KernelImpl;
-} // namespace core
-
-struct SignalPoolT {
- SignalPoolT() {}
- SignalPoolT(const SignalPoolT &) = delete;
- SignalPoolT(SignalPoolT &&) = delete;
- ~SignalPoolT() {
- size_t N = state.size();
- for (size_t i = 0; i < N; i++) {
- hsa_signal_t signal = state.front();
- state.pop();
- hsa_status_t rc = hsa_signal_destroy(signal);
- if (rc != HSA_STATUS_SUCCESS) {
- DP("Signal pool destruction failed\n");
- }
- }
- }
- size_t size() {
- lock l(&mutex);
- return state.size();
- }
- void push(hsa_signal_t s) {
- lock l(&mutex);
- state.push(s);
- }
- hsa_signal_t pop(void) {
- lock l(&mutex);
- if (!state.empty()) {
- hsa_signal_t res = state.front();
- state.pop();
- return res;
- }
-
- // Pool empty, attempt to create another signal
- hsa_signal_t new_signal;
- hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
- if (err == HSA_STATUS_SUCCESS) {
- return new_signal;
- }
-
- // Fail
- return {0};
- }
-
-private:
- static pthread_mutex_t mutex;
- std::queue<hsa_signal_t> state;
- struct lock {
- lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
- ~lock() { pthread_mutex_unlock(m); }
- pthread_mutex_t *m;
- };
-};
-
-namespace core {
-hsa_status_t atl_init_gpu_context();
-
-hsa_status_t init_hsa();
-hsa_status_t finalize_hsa();
-/*
- * Generic utils
- */
-template <typename T> inline T alignDown(T value, size_t alignment) {
- return (T)(value & ~(alignment - 1));
-}
-
-template <typename T> inline T *alignDown(T *value, size_t alignment) {
- return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
-}
-
-template <typename T> inline T alignUp(T value, size_t alignment) {
- return alignDown((T)(value + alignment - 1), alignment);
-}
-
-template <typename T> inline T *alignUp(T *value, size_t alignment) {
- return reinterpret_cast<T *>(
- alignDown((intptr_t)(value + alignment - 1), alignment));
-}
-
-extern bool atl_is_impl_initialized();
-
-bool handle_group_signal(hsa_signal_value_t value, void *arg);
-
-hsa_status_t allow_access_to_all_gpu_agents(void *ptr);
-} // namespace core
-
-inline const char *get_error_string(hsa_status_t err) {
- const char *res;
- hsa_status_t rc = hsa_status_string(err, &res);
- return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN.";
-}
-
-#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
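
The alignDown/alignUp templates removed with internal.h rely on the usual
power-of-two mask trick: clear the low bits to round down, and bias by
alignment - 1 first to round up. A self-contained check of that behavior
(valid for power-of-two alignments only, as the mask assumes):

#include <cassert>
#include <cstddef>

template <typename T> inline T alignDown(T Value, size_t Alignment) {
  return (T)(Value & ~(Alignment - 1)); // clear the low log2(Alignment) bits
}
template <typename T> inline T alignUp(T Value, size_t Alignment) {
  return alignDown((T)(Value + Alignment - 1), Alignment);
}

int main() {
  assert(alignDown(13u, 8) == 8u);
  assert(alignUp(13u, 8) == 16u);
  assert(alignUp(16u, 8) == 16u); // already-aligned values are unchanged
  return 0;
}
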
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
deleted file mode 100644
index f10057d98d21bf..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.cpp ------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "interop_hsa.h"
-#include "internal.h"
-
-hsa_status_t interop_hsa_get_symbol_info(
- const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
- int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) {
- /*
- // Typical usage:
- void *var_addr;
- size_t var_size;
- interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr,
- &var_size);
- impl_memcpy(signal, host_add, var_addr, var_size);
- */
-
- if (!symbol || !var_addr || !var_size)
- return HSA_STATUS_ERROR;
-
- // get the symbol info
- std::string symbolStr = std::string(symbol);
- auto It = SymbolInfoTable.find(symbolStr);
- if (It != SymbolInfoTable.end()) {
- atl_symbol_info_t info = It->second;
- *var_addr = reinterpret_cast<void *>(info.addr);
- *var_size = info.size;
- return HSA_STATUS_SUCCESS;
- } else {
- *var_addr = NULL;
- *var_size = 0;
- return HSA_STATUS_ERROR;
- }
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
deleted file mode 100644
index 488a99e23c385f..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.h -------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_INTEROP_HSA_H_
-#define INCLUDE_INTEROP_HSA_H_
-
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-
-#include <map>
-#include <string>
-
-extern "C" {
-
-hsa_status_t interop_hsa_get_symbol_info(
- const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
- int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size);
-
-}
-
-#endif // INCLUDE_INTEROP_HSA_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
deleted file mode 100644
index 5305040da3bb11..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <functional>
-#include <string>
-
-#include "msgpack.h"
-
-namespace msgpack {
-
-[[noreturn]] void internal_error() {
- printf("internal error\n");
- exit(1);
-}
-
-const char *type_name(type ty) {
- switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
- case NAME: \
- return #NAME;
-#include "msgpack.def"
-#undef X
- }
- internal_error();
-}
-
-unsigned bytes_used_fixed(msgpack::type ty) {
- using namespace msgpack;
- switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
- case NAME: \
- return WIDTH;
-#include "msgpack.def"
-#undef X
- }
- internal_error();
-}
-
-msgpack::type parse_type(unsigned char x) {
-
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
- if (x >= LOWER && x <= UPPER) { \
- return NAME; \
- } else
-#include "msgpack.def"
-#undef X
- { internal_error(); }
-}
-
-template <typename T, typename R> R bitcast(T x) {
- static_assert(sizeof(T) == sizeof(R), "");
- R tmp;
- memcpy(&tmp, &x, sizeof(T));
- return tmp;
-}
-template int64_t bitcast<uint64_t, int64_t>(uint64_t);
-} // namespace msgpack
-
-// Helper functions for reading additional payload from the header
-// Depending on the type, this can be a number of bytes, elements,
-// key-value pairs or an embedded integer.
-// Each takes a pointer to the start of the header and returns a uint64_t
-
-namespace {
-namespace payload {
-uint64_t read_zero(const unsigned char *) { return 0; }
-
-// Read the first byte and zero/sign extend it
-uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
-uint64_t read_embedded_s8(const unsigned char *start) {
- int64_t res = msgpack::bitcast<uint8_t, int8_t>(start[0]);
- return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-
-// Read a masked part of the first byte
-uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
-uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
-uint64_t read_via_mask_0x1f(const unsigned char *start) {
- return *start & 0x1fu;
-}
-
-// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
-// Big endian format.
-uint64_t read_size_field_u8(const unsigned char *from) {
- from++;
- return from[0];
-}
-
-// TODO: detect whether host is little endian or not, and whether the intrinsic
-// is available. And probably use the builtin to test the diy
-const bool use_bswap = false;
-
-uint64_t read_size_field_u16(const unsigned char *from) {
- from++;
- if (use_bswap) {
- uint16_t b;
- memcpy(&b, from, 2);
- return __builtin_bswap16(b);
- } else {
- return (from[0] << 8u) | from[1];
- }
-}
-uint64_t read_size_field_u32(const unsigned char *from) {
- from++;
- if (use_bswap) {
- uint32_t b;
- memcpy(&b, from, 4);
- return __builtin_bswap32(b);
- } else {
- return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) |
- (from[3] << 0u);
- }
-}
-uint64_t read_size_field_u64(const unsigned char *from) {
- from++;
- if (use_bswap) {
- uint64_t b;
- memcpy(&b, from, 8);
- return __builtin_bswap64(b);
- } else {
- return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
- ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
- (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) |
- (from[7] << 0u);
- }
-}
-
-uint64_t read_size_field_s8(const unsigned char *from) {
- uint8_t u = read_size_field_u8(from);
- int64_t res = msgpack::bitcast<uint8_t, int8_t>(u);
- return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s16(const unsigned char *from) {
- uint16_t u = read_size_field_u16(from);
- int64_t res = msgpack::bitcast<uint16_t, int16_t>(u);
- return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s32(const unsigned char *from) {
- uint32_t u = read_size_field_u32(from);
- int64_t res = msgpack::bitcast<uint32_t, int32_t>(u);
- return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s64(const unsigned char *from) {
- uint64_t u = read_size_field_u64(from);
- int64_t res = msgpack::bitcast<uint64_t, int64_t>(u);
- return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-} // namespace payload
-} // namespace
-
-namespace msgpack {
-
-payload_info_t payload_info(msgpack::type ty) {
- using namespace msgpack;
- switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
- case NAME: \
- return payload::PAYLOAD;
-#include "msgpack.def"
-#undef X
- }
- internal_error();
-}
-
-} // namespace msgpack
-
-const unsigned char *msgpack::skip_next_message(const unsigned char *start,
- const unsigned char *end) {
- class f : public functors_defaults<f> {};
- return handle_msgpack({start, end}, f());
-}
-
-namespace msgpack {
-bool message_is_string(byte_range bytes, const char *needle) {
- bool matched = false;
- size_t needleN = strlen(needle);
-
- foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
- if (N == needleN) {
- if (memcmp(needle, str, N) == 0) {
- matched = true;
- }
- }
- });
- return matched;
-}
-
-void dump(byte_range bytes) {
- struct inner : functors_defaults<inner> {
- inner(unsigned indent) : indent(indent) {}
- const unsigned by = 2;
- unsigned indent = 0;
-
- void handle_string(size_t N, const unsigned char *bytes) {
- char *tmp = (char *)malloc(N + 1);
- memcpy(tmp, bytes, N);
- tmp[N] = '\0';
- printf("\"%s\"", tmp);
- free(tmp);
- }
-
- void handle_signed(int64_t x) { printf("%ld", x); }
- void handle_unsigned(uint64_t x) { printf("%lu", x); }
-
- const unsigned char *handle_array(uint64_t N, byte_range bytes) {
- printf("\n%*s[\n", indent, "");
- indent += by;
-
- for (uint64_t i = 0; i < N; i++) {
- indent += by;
- printf("%*s", indent, "");
- const unsigned char *next = handle_msgpack<inner>(bytes, {indent});
- printf(",\n");
- indent -= by;
- bytes.start = next;
- if (!next) {
- break;
- }
- }
- indent -= by;
- printf("%*s]", indent, "");
-
- return bytes.start;
- }
-
- const unsigned char *handle_map(uint64_t N, byte_range bytes) {
- printf("\n%*s{\n", indent, "");
- indent += by;
-
- for (uint64_t i = 0; i < 2 * N; i += 2) {
- const unsigned char *start_key = bytes.start;
- printf("%*s", indent, "");
- const unsigned char *end_key =
- handle_msgpack<inner>({start_key, bytes.end}, {indent});
- if (!end_key) {
- break;
- }
-
- printf(" : ");
-
- const unsigned char *start_value = end_key;
- const unsigned char *end_value =
- handle_msgpack<inner>({start_value, bytes.end}, {indent});
-
- if (!end_value) {
- break;
- }
-
- printf(",\n");
- bytes.start = end_value;
- }
-
- indent -= by;
- printf("%*s}", indent, "");
-
- return bytes.start;
- }
- };
-
- handle_msgpack<inner>(bytes, {0});
- printf("\n");
-}
-
-} // namespace msgpack
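A small hypothetical driver for the dump routine above: 0x81 opens a one-entry fixmap, 0xa1 'x' is the fixstr key "x", and 0x2a is the positive fixint 42.

  #include "msgpack.h"

  int main() {
    const unsigned char buf[] = {0x81, 0xa1, 'x', 0x2a}; // {"x": 42}
    msgpack::dump({buf, buf + sizeof(buf)});
    return 0;
  }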
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
deleted file mode 100644
index baef0594422396..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
+++ /dev/null
@@ -1,46 +0,0 @@
-//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// name, header width, reader, [lower, upper] encoding
-X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
-X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
-X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
-X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
-X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
-X(nil, 1, read_zero, 0xc0, 0xc0)
-X(never_used, 1, read_zero, 0xc1, 0xc1)
-X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
-X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
-X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
-X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
-X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
-X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
-X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
-X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
-X(float32, 5, read_zero, 0xca, 0xca)
-X(float64, 9, read_zero, 0xcb, 0xcb)
-X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
-X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
-X(uint32, 5, read_size_field_u32, 0xce, 0xce)
-X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
-X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
-X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
-X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
-X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
-X(fixext1, 3, read_zero, 0xd4, 0xd4)
-X(fixext2, 4, read_zero, 0xd5, 0xd5)
-X(fixext4, 6, read_zero, 0xd6, 0xd6)
-X(fixext8, 10, read_zero, 0xd7, 0xd7)
-X(fixext16, 18, read_zero, 0xd8, 0xd8)
-X(str8, 2, read_size_field_u8, 0xd9, 0xd9)
-X(str16, 3, read_size_field_u16, 0xda, 0xda)
-X(str32, 5, read_size_field_u32, 0xdb, 0xdb)
-X(array16, 3, read_size_field_u16, 0xdc, 0xdc)
-X(array32, 5, read_size_field_u32, 0xdd, 0xdd)
-X(map16, 3, read_size_field_u16, 0xde, 0xde)
-X(map32, 5, read_size_field_u32, 0xdf, 0xdf)
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
deleted file mode 100644
index 6a4b7eabeb8252..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
+++ /dev/null
@@ -1,282 +0,0 @@
-//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MSGPACK_H
-#define MSGPACK_H
-
-#include <functional>
-
-namespace msgpack {
-
-// The message pack format is dynamically typed, schema-less. Format is:
-// message: [type][header][payload]
-// where type is one byte, header length is a fixed length function of type
-// payload is zero to N bytes, with the length encoded in [type][header]
-
-// Scalar fields include boolean, signed integer, float, string, etc.
-// Composite types are sequences of messages
-// Array field is [header][element][element]...
-// Map field is [header][key][value][key][value]...
-
-// Multibyte integer fields are big endian encoded
-// The map key can be any message type
-// Maps may contain duplicate keys
-// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
-// in as many as nine, as signed or unsigned. Implementation defined.
-// Similarly "foo" may embed the length in the type field or in multiple bytes
-
-// This parser is structured as an iterator over a sequence of bytes.
-// It calls a user-provided function on each message in order to extract fields.
-// The default implementation for each scalar type is to do nothing. For maps
-// and arrays, the default implementation returns just after that message to
-// support iterating to the next message, but otherwise has no effect.
-
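To make the framing concrete, here is a hypothetical byte-level walkthrough of one message; the array [1, 300] encodes as a fixarray header followed by a positive fixint and a uint16:

  // [1, 300] as msgpack bytes (illustrative only):
  //   0x92            fixarray, N = 2 (the length lives in the type byte)
  //   0x01            positive fixint 1 (the type byte is the payload)
  //   0xcd 0x01 0x2c  uint16 300, big endian: (0x01 << 8) | 0x2c == 300
  const unsigned char example[] = {0x92, 0x01, 0xcd, 0x01, 0x2c};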
-struct byte_range {
- const unsigned char *start;
- const unsigned char *end;
-};
-
-const unsigned char *skip_next_message(const unsigned char *start,
- const unsigned char *end);
-
-template <typename Derived> class functors_defaults {
-public:
- void cb_string(size_t N, const unsigned char *str) {
- derived().handle_string(N, str);
- }
- void cb_boolean(bool x) { derived().handle_boolean(x); }
- void cb_signed(int64_t x) { derived().handle_signed(x); }
- void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
- void cb_array_elements(byte_range bytes) {
- derived().handle_array_elements(bytes);
- }
- void cb_map_elements(byte_range key, byte_range value) {
- derived().handle_map_elements(key, value);
- }
- const unsigned char *cb_array(uint64_t N, byte_range bytes) {
- return derived().handle_array(N, bytes);
- }
- const unsigned char *cb_map(uint64_t N, byte_range bytes) {
- return derived().handle_map(N, bytes);
- }
-
-private:
- Derived &derived() { return *static_cast<Derived *>(this); }
-
- // Default implementations for scalar ops are no-ops
- void handle_string(size_t, const unsigned char *) {}
- void handle_boolean(bool) {}
- void handle_signed(int64_t) {}
- void handle_unsigned(uint64_t) {}
- void handle_array_elements(byte_range) {}
- void handle_map_elements(byte_range, byte_range) {}
-
- // Default implementation for sequences is to skip over the messages
- const unsigned char *handle_array(uint64_t N, byte_range bytes) {
- for (uint64_t i = 0; i < N; i++) {
- const unsigned char *next = skip_next_message(bytes.start, bytes.end);
- if (!next) {
- return nullptr;
- }
- cb_array_elements(bytes);
- bytes.start = next;
- }
- return bytes.start;
- }
- const unsigned char *handle_map(uint64_t N, byte_range bytes) {
- for (uint64_t i = 0; i < N; i++) {
- const unsigned char *start_key = bytes.start;
- const unsigned char *end_key = skip_next_message(start_key, bytes.end);
- if (!end_key) {
- return nullptr;
- }
- const unsigned char *start_value = end_key;
- const unsigned char *end_value =
- skip_next_message(start_value, bytes.end);
- if (!end_value) {
- return nullptr;
- }
- cb_map_elements({start_key, end_key}, {start_value, end_value});
- bytes.start = end_value;
- }
- return bytes.start;
- }
-};
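A derived functor overrides only the callbacks it needs and inherits the skip-over defaults for everything else. A hypothetical example in the same style as the helpers further down this header:

  #include "msgpack.h"
  #include <cstdint>

  // Reads a top-level unsigned scalar message, leaving 0 otherwise.
  struct read_unsigned : msgpack::functors_defaults<read_unsigned> {
    read_unsigned(uint64_t &out) : out(out) {}
    uint64_t &out;
    void handle_unsigned(uint64_t x) { out = x; }
  };

  uint64_t unsigned_or_zero(msgpack::byte_range bytes) {
    uint64_t out = 0;
    msgpack::handle_msgpack<read_unsigned>(bytes, {out});
    return out;
  }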
-
-typedef enum : uint8_t {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
-#include "msgpack.def"
-#undef X
-} type;
-
-[[noreturn]] void internal_error();
-type parse_type(unsigned char x);
-unsigned bytes_used_fixed(type ty);
-
-typedef uint64_t (*payload_info_t)(const unsigned char *);
-payload_info_t payload_info(msgpack::type ty);
-
-template <typename T, typename R> R bitcast(T x);
-
-template <typename F, msgpack::type ty>
-const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
- const unsigned char *start = bytes.start;
- const unsigned char *end = bytes.end;
- const uint64_t available = end - start;
- assert(available != 0);
- assert(ty == parse_type(*start));
-
- const uint64_t bytes_used = bytes_used_fixed(ty);
- if (available < bytes_used) {
- return 0;
- }
- const uint64_t available_post_header = available - bytes_used;
-
- const payload_info_t info = payload_info(ty);
- const uint64_t N = info(start);
-
- switch (ty) {
- case msgpack::t:
- case msgpack::f: {
- // f is 0b11000010, t is 0b11000011, masked with 0x1
- f.cb_boolean(N);
- return start + bytes_used;
- }
-
- case msgpack::posfixint:
- case msgpack::uint8:
- case msgpack::uint16:
- case msgpack::uint32:
- case msgpack::uint64: {
- f.cb_unsigned(N);
- return start + bytes_used;
- }
-
- case msgpack::negfixint:
- case msgpack::int8:
- case msgpack::int16:
- case msgpack::int32:
- case msgpack::int64: {
- f.cb_signed(bitcast<uint64_t, int64_t>(N));
- return start + bytes_used;
- }
-
- case msgpack::fixstr:
- case msgpack::str8:
- case msgpack::str16:
- case msgpack::str32: {
- if (available_post_header < N) {
- return 0;
- } else {
- f.cb_string(N, start + bytes_used);
- return start + bytes_used + N;
- }
- }
-
- case msgpack::fixarray:
- case msgpack::array16:
- case msgpack::array32: {
- return f.cb_array(N, {start + bytes_used, end});
- }
-
- case msgpack::fixmap:
- case msgpack::map16:
- case msgpack::map32: {
- return f.cb_map(N, {start + bytes_used, end});
- }
-
- case msgpack::nil:
- case msgpack::bin8:
- case msgpack::bin16:
- case msgpack::bin32:
- case msgpack::float32:
- case msgpack::float64:
- case msgpack::ext8:
- case msgpack::ext16:
- case msgpack::ext32:
- case msgpack::fixext1:
- case msgpack::fixext2:
- case msgpack::fixext4:
- case msgpack::fixext8:
- case msgpack::fixext16:
- case msgpack::never_used: {
- if (available_post_header < N) {
- return 0;
- }
- return start + bytes_used + N;
- }
- }
- internal_error();
-}
-
-template <typename F>
-const unsigned char *handle_msgpack(byte_range bytes, F f) {
- const unsigned char *start = bytes.start;
- const unsigned char *end = bytes.end;
- const uint64_t available = end - start;
- if (available == 0) {
- return 0;
- }
- const type ty = parse_type(*start);
-
- switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
- case msgpack::NAME: \
- return handle_msgpack_given_type<F, msgpack::NAME>(bytes, f);
-#include "msgpack.def"
-#undef X
- }
-
- internal_error();
-}
-
-bool message_is_string(byte_range bytes, const char *str);
-
-template <typename C> void foronly_string(byte_range bytes, C callback) {
- struct inner : functors_defaults<inner> {
- inner(C &cb) : cb(cb) {}
- C &cb;
- void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
- };
- handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foronly_unsigned(byte_range bytes, C callback) {
- struct inner : functors_defaults<inner> {
- inner(C &cb) : cb(cb) {}
- C &cb;
- void handle_unsigned(uint64_t x) { cb(x); }
- };
- handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_array(byte_range bytes, C callback) {
- struct inner : functors_defaults<inner> {
- inner(C &cb) : cb(cb) {}
- C &cb;
- void handle_array_elements(byte_range element) { cb(element); }
- };
- handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_map(byte_range bytes, C callback) {
- struct inner : functors_defaults<inner> {
- inner(C &cb) : cb(cb) {}
- C &cb;
- void handle_map_elements(byte_range key, byte_range value) {
- cb(key, value);
- }
- };
- handle_msgpack<inner>(bytes, {callback});
-}
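A hypothetical combination of these adapters, printing every string key of a map message (assumes the header above):

  #include "msgpack.h"
  #include <cstddef>
  #include <cstdio>

  void print_keys(msgpack::byte_range map_message) {
    msgpack::foreach_map(map_message,
                         [](msgpack::byte_range key, msgpack::byte_range) {
                           msgpack::foronly_string(
                               key, [](size_t n, const unsigned char *s) {
                                 printf("key: %.*s\n", (int)n, s);
                               });
                         });
  }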
-
-// Crude approximation to json
-void dump(byte_range);
-
-} // namespace msgpack
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
deleted file mode 100644
index 8b09a8466cccbd..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/rt.h ----------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_RT_H_
-#define SRC_RUNTIME_INCLUDE_RT_H_
-
-#include "hsa_api.h"
-#include "impl_runtime.h"
-#include "internal.h"
-
-#include <string>
-
-namespace core {
-namespace Runtime {
-hsa_status_t Memfree(void *);
-hsa_status_t HostMalloc(void **ptr, size_t size,
- hsa_amd_memory_pool_t MemoryPool);
-
-} // namespace Runtime
-hsa_status_t RegisterModuleFromMemory(
- std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
- std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
- void *module_bytes, size_t module_size, hsa_agent_t agent,
- hsa_status_t (*on_deserialized_data)(void *data, size_t size,
- void *cb_state),
- void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
-
-} // namespace core
-
-#endif // SRC_RUNTIME_INCLUDE_RT_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
deleted file mode 100644
index a678bad55e2d14..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include <cassert>
-#include <sstream>
-#include <string>
-
-#include "internal.h"
-#include "rt.h"
-
-#include "msgpack.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-
-namespace hsa {
-// Wrap HSA iterate API in a shim that allows passing general callables
-template <typename C>
-hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
- auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
- void *data) -> hsa_status_t {
- C *unwrapped = static_cast<C *>(data);
- return (*unwrapped)(executable, symbol);
- };
- return hsa_executable_iterate_symbols(executable, L,
- static_cast<void *>(&cb));
-}
-} // namespace hsa
-
-typedef unsigned char *address;
-/*
- * Note descriptors.
- */
-// FreeBSD already declares Elf_Note (indirectly via <libelf.h>)
-#if !defined(__FreeBSD__)
-typedef struct {
- uint32_t n_namesz; /* Length of note's name. */
- uint32_t n_descsz; /* Length of note's value. */
- uint32_t n_type; /* Type of note. */
- // then name
- // then padding, optional
- // then desc, at 4 byte alignment (not 8, despite being elf64)
-} Elf_Note;
-#endif
-
-class KernelArgMD {
-public:
- enum class ValueKind {
- HiddenGlobalOffsetX,
- HiddenGlobalOffsetY,
- HiddenGlobalOffsetZ,
- HiddenNone,
- HiddenPrintfBuffer,
- HiddenDefaultQueue,
- HiddenCompletionAction,
- HiddenMultiGridSyncArg,
- HiddenHostcallBuffer,
- HiddenHeapV1,
- Unknown
- };
-
- KernelArgMD()
- : name_(std::string()), size_(0), offset_(0),
- valueKind_(ValueKind::Unknown) {}
-
- // fields
- std::string name_;
- uint32_t size_;
- uint32_t offset_;
- ValueKind valueKind_;
-};
-
-static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
- // v3
- // {"by_value", KernelArgMD::ValueKind::ByValue},
- // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
- // {"dynamic_shared_pointer",
- // KernelArgMD::ValueKind::DynamicSharedPointer},
- // {"sampler", KernelArgMD::ValueKind::Sampler},
- // {"image", KernelArgMD::ValueKind::Image},
- // {"pipe", KernelArgMD::ValueKind::Pipe},
- // {"queue", KernelArgMD::ValueKind::Queue},
- {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
- {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
- {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
- {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
- {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
- {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
- {"hidden_completion_action",
- KernelArgMD::ValueKind::HiddenCompletionAction},
- {"hidden_multigrid_sync_arg",
- KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
- {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
- {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
-
-namespace core {
-
-hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
- if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
- hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
- // memory_fault.agent
- // memory_fault.virtual_address
- // memory_fault.fault_reason_mask
- // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
- std::stringstream stream;
- stream << std::hex << (uintptr_t)memory_fault.virtual_address;
- std::string addr("0x" + stream.str());
-
- std::string err_string = "[GPU Memory Error] Addr: " + addr;
- err_string += " Reason: ";
- if (!(memory_fault.fault_reason_mask & 0x00111111)) {
- err_string += "No Idea! ";
- } else {
- if (memory_fault.fault_reason_mask & 0x00000001)
- err_string += "Page not present or supervisor privilege. ";
- if (memory_fault.fault_reason_mask & 0x00000010)
- err_string += "Write access to a read-only page. ";
- if (memory_fault.fault_reason_mask & 0x00000100)
- err_string += "Execute access to a page marked NX. ";
- if (memory_fault.fault_reason_mask & 0x00001000)
- err_string += "Host access only. ";
- if (memory_fault.fault_reason_mask & 0x00010000)
- err_string += "ECC failure (if supported by HW). ";
- if (memory_fault.fault_reason_mask & 0x00100000)
- err_string += "Can't determine the exact fault address. ";
- }
- fprintf(stderr, "%s\n", err_string.c_str());
- return HSA_STATUS_ERROR;
- }
- return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t atl_init_gpu_context() {
- hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Registering the system for memory faults", get_error_string(err));
- return HSA_STATUS_ERROR;
- }
-
- return HSA_STATUS_SUCCESS;
-}
-
-static bool isImplicit(KernelArgMD::ValueKind value_kind) {
- switch (value_kind) {
- case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
- case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
- case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
- case KernelArgMD::ValueKind::HiddenNone:
- case KernelArgMD::ValueKind::HiddenPrintfBuffer:
- case KernelArgMD::ValueKind::HiddenDefaultQueue:
- case KernelArgMD::ValueKind::HiddenCompletionAction:
- case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
- case KernelArgMD::ValueKind::HiddenHostcallBuffer:
- case KernelArgMD::ValueKind::HiddenHeapV1:
- return true;
- default:
- return false;
- }
-}
-
-static std::pair<const unsigned char *, const unsigned char *>
-findMetadata(const ELFObjectFile<ELF64LE> &ELFObj) {
- constexpr std::pair<const unsigned char *, const unsigned char *> Failure = {
- nullptr, nullptr};
- const auto &Elf = ELFObj.getELFFile();
- auto PhdrsOrErr = Elf.program_headers();
- if (!PhdrsOrErr) {
- consumeError(PhdrsOrErr.takeError());
- return Failure;
- }
-
- for (auto Phdr : *PhdrsOrErr) {
- if (Phdr.p_type != PT_NOTE)
- continue;
-
- Error Err = Error::success();
- for (auto Note : Elf.notes(Phdr, Err)) {
- if (Note.getType() == 7 || Note.getType() == 8)
- return Failure;
-
- // Code object v2 uses yaml metadata and is no longer supported.
- if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD")
- return Failure;
- // Code object v3 should have AMDGPU metadata.
- if (Note.getType() == NT_AMDGPU_METADATA && Note.getName() != "AMDGPU")
- return Failure;
-
- ArrayRef<uint8_t> Desc = Note.getDesc(Phdr.p_align);
- return {Desc.data(), Desc.data() + Desc.size()};
- }
-
- if (Err) {
- consumeError(std::move(Err));
- return Failure;
- }
- }
-
- return Failure;
-}
-
-static std::pair<const unsigned char *, const unsigned char *>
-find_metadata(void *binary, size_t binSize) {
- constexpr std::pair<const unsigned char *, const unsigned char *> Failure = {
- nullptr, nullptr};
-
- StringRef Buffer = StringRef(static_cast<const char *>(binary), binSize);
- auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
- /*InitContent=*/false);
- if (!ElfOrErr) {
- consumeError(ElfOrErr.takeError());
- return Failure;
- }
-
- if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
- return findMetadata(*ELFObj);
- return Failure;
-}
-
-namespace {
-int map_lookup_array(msgpack::byte_range message, const char *needle,
- msgpack::byte_range *res, uint64_t *size) {
- unsigned count = 0;
- struct s : msgpack::functors_defaults<s> {
- s(unsigned &count, uint64_t *size) : count(count), size(size) {}
- unsigned &count;
- uint64_t *size;
- const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
- count++;
- *size = N;
- return bytes.end;
- }
- };
-
- msgpack::foreach_map(message,
- [&](msgpack::byte_range key, msgpack::byte_range value) {
- if (msgpack::message_is_string(key, needle)) {
- // If the message is an array, record number of
- // elements in *size
- msgpack::handle_msgpack<s>(value, {count, size});
- // return the whole array
- *res = value;
- }
- });
- // Only claim success if exactly one key/array pair matched
- return count != 1;
-}
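For illustration, a hypothetical lookup against a hand-encoded map {"amdhsa.kernels": [1, 2]}, written as if it sat beside the helpers above: 0x81 opens a one-entry fixmap, 0xae is a 14-byte fixstr key, and 0x92 a two-element fixarray.

  int demo_lookup() {
    const unsigned char buf[] = {
        0x81,                                // fixmap, 1 entry
        0xae, 'a', 'm', 'd', 'h', 's', 'a',
        '.',  'k', 'e', 'r', 'n', 'e', 'l',
        's',                                 // fixstr key, 14 bytes
        0x92, 0x01, 0x02};                   // fixarray [1, 2]
    msgpack::byte_range arr;
    uint64_t n = 0;
    int rc = map_lookup_array({buf, buf + sizeof(buf)}, "amdhsa.kernels",
                              &arr, &n);
    return (rc == 0 && n == 2) ? 0 : 1; // rc == 0 means exactly one match
  }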
-
-int map_lookup_string(msgpack::byte_range message, const char *needle,
- std::string *res) {
- unsigned count = 0;
- struct s : public msgpack::functors_defaults<s> {
- s(unsigned &count, std::string *res) : count(count), res(res) {}
- unsigned &count;
- std::string *res;
- void handle_string(size_t N, const unsigned char *str) {
- count++;
- *res = std::string(str, str + N);
- }
- };
- msgpack::foreach_map(message,
- [&](msgpack::byte_range key, msgpack::byte_range value) {
- if (msgpack::message_is_string(key, needle)) {
- msgpack::handle_msgpack<s>(value, {count, res});
- }
- });
- return count != 1;
-}
-
-int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
- uint64_t *res) {
- unsigned count = 0;
- msgpack::foreach_map(message,
- [&](msgpack::byte_range key, msgpack::byte_range value) {
- if (msgpack::message_is_string(key, needle)) {
- msgpack::foronly_unsigned(value, [&](uint64_t x) {
- count++;
- *res = x;
- });
- }
- });
- return count != 1;
-}
-
-int array_lookup_element(msgpack::byte_range message, uint64_t elt,
- msgpack::byte_range *res) {
- int rc = 1;
- uint64_t i = 0;
- msgpack::foreach_array(message, [&](msgpack::byte_range value) {
- if (i == elt) {
- *res = value;
- rc = 0;
- }
- i++;
- });
- return rc;
-}
-
-int populate_kernelArgMD(msgpack::byte_range args_element,
- KernelArgMD *kernelarg) {
- using namespace msgpack;
- int error = 0;
- foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
- if (message_is_string(key, ".name")) {
- foronly_string(value, [&](size_t N, const unsigned char *str) {
- kernelarg->name_ = std::string(str, str + N);
- });
- } else if (message_is_string(key, ".size")) {
- foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
- } else if (message_is_string(key, ".offset")) {
- foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
- } else if (message_is_string(key, ".value_kind")) {
- foronly_string(value, [&](size_t N, const unsigned char *str) {
- std::string s = std::string(str, str + N);
- auto itValueKind = ArgValueKind.find(s);
- if (itValueKind != ArgValueKind.end()) {
- kernelarg->valueKind_ = itValueKind->second;
- }
- });
- }
- });
- return error;
-}
-} // namespace
-
-static hsa_status_t get_code_object_custom_metadata(
- void *binary, size_t binSize,
- std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
- // parse code object with different keys from v2
- // also, the kernel name is not the same as the symbol name -- so a
- // symbol->name map is needed
-
- std::pair<const unsigned char *, const unsigned char *> metadata =
- find_metadata(binary, binSize);
- if (!metadata.first) {
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- uint64_t kernelsSize = 0;
- int msgpack_errors = 0;
- msgpack::byte_range kernel_array;
- msgpack_errors =
- map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
- &kernel_array, &kernelsSize);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "kernels lookup in program metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- for (size_t i = 0; i < kernelsSize; i++) {
- assert(msgpack_errors == 0);
- std::string kernelName;
- std::string symbolName;
-
- msgpack::byte_range element;
- msgpack_errors += array_lookup_element(kernel_array, i, &element);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "element lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- msgpack_errors += map_lookup_string(element, ".name", &kernelName);
- msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "strings lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- // Make sure that kernelName + ".kd" == symbolName
- if ((kernelName + ".kd") != symbolName) {
- printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
- __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
- uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
- msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "sgpr count metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- info.sgpr_count = sgpr_count;
-
- msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "vgpr count metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- info.vgpr_count = vgpr_count;
-
- msgpack_errors +=
- map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "sgpr spill count metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- info.sgpr_spill_count = sgpr_spill_count;
-
- msgpack_errors +=
- map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "vgpr spill count metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- info.vgpr_spill_count = vgpr_spill_count;
-
- size_t kernel_explicit_args_size = 0;
- uint64_t kernel_segment_size;
- msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
- &kernel_segment_size);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "kernarg segment size metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- bool hasHiddenArgs = false;
- if (kernel_segment_size > 0) {
- uint64_t argsSize;
- size_t offset = 0;
-
- msgpack::byte_range args_array;
- msgpack_errors +=
- map_lookup_array(element, ".args", &args_array, &argsSize);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "kernel args metadata lookup in kernel metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- for (size_t i = 0; i < argsSize; ++i) {
- KernelArgMD lcArg;
-
- msgpack::byte_range args_element;
- msgpack_errors += array_lookup_element(args_array, i, &args_element);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "iterate args map in kernel args metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
-
- msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
- if (msgpack_errors != 0) {
- printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
- "iterate args map in kernel args metadata");
- return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
- }
- // v3 has offset field and not align field
- size_t new_offset = lcArg.offset_;
- size_t padding = new_offset - offset;
- offset = new_offset;
- DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
- lcArg.offset_);
- offset += lcArg.size_;
-
- // check if the arg is a hidden/implicit arg
- // this logic assumes that all hidden args are 8-byte aligned
- if (!isImplicit(lcArg.valueKind_)) {
- info.explicit_argument_count++;
- kernel_explicit_args_size += lcArg.size_;
- } else {
- info.implicit_argument_count++;
- hasHiddenArgs = true;
- }
- kernel_explicit_args_size += padding;
- }
- }
-
- // TODO: Probably don't want this arithmetic
- info.kernel_segment_size =
- (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
- DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
- kernel_segment_size, info.kernel_segment_size);
-
- // kernel received, now add it to the kernel info table
- KernelInfoTable[kernelName] = info;
- }
-
- return HSA_STATUS_SUCCESS;
-}
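As a worked example of the offset/padding arithmetic in the loop above (hypothetical metadata): with arg0 at offset 0 of size 4 and arg1 at offset 8 of size 8, the second iteration computes padding = 8 - 4 = 4, so the explicit-args total accumulates 4 + 8 + 4 = 16 bytes, reproducing the aligned kernarg layout even though v3 metadata carries no align field.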
-
-static hsa_status_t
-populate_InfoTables(hsa_executable_symbol_t symbol,
- std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
- std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
- hsa_symbol_kind_t type;
-
- uint32_t name_length;
- hsa_status_t err;
- err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
- &type);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info extraction", get_error_string(err));
- return err;
- }
- DP("Exec Symbol type: %d\n", type);
- if (type == HSA_SYMBOL_KIND_KERNEL) {
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info extraction", get_error_string(err));
- return err;
- }
- char *name = reinterpret_cast<char *>(malloc(name_length + 1));
- err = hsa_executable_symbol_get_info(symbol,
- HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info extraction", get_error_string(err));
- return err;
- }
- // remove the suffix .kd from symbol name.
- name[name_length - 3] = 0;
-
- atl_kernel_info_t info;
- std::string kernelName(name);
- // by now, the kernel info table should already have an entry
- // because the non-ROCr custom code object parsing is called before
- // iterating over the code object symbols using ROCr
- if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) {
- DP("amdgpu internal consistency error\n");
- return HSA_STATUS_ERROR;
- }
- // found, so assign and update
- info = KernelInfoTable[kernelName];
-
- /* Extract dispatch information from the symbol */
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
- &(info.kernel_object));
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Extracting the symbol from the executable",
- get_error_string(err));
- return err;
- }
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
- &(info.group_segment_size));
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Extracting the group segment size from the executable",
- get_error_string(err));
- return err;
- }
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
- &(info.private_segment_size));
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Extracting the private segment from the executable",
- get_error_string(err));
- return err;
- }
-
- DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
- "kernarg\n",
- kernelName.c_str(), info.kernel_object, info.group_segment_size,
- info.private_segment_size, info.kernel_segment_size);
-
- // assign it back to the kernel info table
- KernelInfoTable[kernelName] = info;
- free(name);
- } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info extraction", get_error_string(err));
- return err;
- }
- char *name = reinterpret_cast<char *>(malloc(name_length + 1));
- err = hsa_executable_symbol_get_info(symbol,
- HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info extraction", get_error_string(err));
- return err;
- }
- name[name_length] = 0;
-
- atl_symbol_info_t info;
-
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info address extraction", get_error_string(err));
- return err;
- }
-
- err = hsa_executable_symbol_get_info(
- symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Symbol info size extraction", get_error_string(err));
- return err;
- }
-
- DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size);
- SymbolInfoTable[std::string(name)] = info;
- free(name);
- } else {
- DP("Symbol is an indirect function\n");
- }
- return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t RegisterModuleFromMemory(
- std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
- std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
- void *module_bytes, size_t module_size, hsa_agent_t agent,
- hsa_status_t (*on_deserialized_data)(void *data, size_t size,
- void *cb_state),
- void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
- hsa_status_t err;
- hsa_executable_t executable = {0};
- hsa_profile_t agent_profile;
-
- err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Query the agent profile", get_error_string(err));
- return HSA_STATUS_ERROR;
- }
- // FIXME: Assume that every profile is FULL until we understand how to build
- // GCN with base profile
- agent_profile = HSA_PROFILE_FULL;
- /* Create the empty executable. */
- err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
- &executable);
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Create the executable", get_error_string(err));
- return HSA_STATUS_ERROR;
- }
-
- bool module_load_success = false;
- do // Existing control flow used continue, preserve that for this patch
- {
- {
- // Some metadata info is not available through ROCr API, so use custom
- // code object metadata parsing to collect such metadata info
-
- err = get_code_object_custom_metadata(module_bytes, module_size,
- KernelInfoTable);
- if (err != HSA_STATUS_SUCCESS) {
- DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Getting custom code object metadata", get_error_string(err));
- continue;
- }
-
- // Deserialize code object.
- hsa_code_object_t code_object = {0};
- err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
- &code_object);
- if (err != HSA_STATUS_SUCCESS) {
- DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Code Object Deserialization", get_error_string(err));
- continue;
- }
- assert(0 != code_object.handle);
-
- // Mutating the device image here avoids another allocation & memcpy
- void *code_object_alloc_data =
- reinterpret_cast<void *>(code_object.handle);
- hsa_status_t impl_err =
- on_deserialized_data(code_object_alloc_data, module_size, cb_state);
- if (impl_err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Error in deserialized_data callback",
- get_error_string(impl_err));
- return impl_err;
- }
-
- /* Load the code object. */
- err =
- hsa_executable_load_code_object(executable, agent, code_object, NULL);
- if (err != HSA_STATUS_SUCCESS) {
- DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Loading the code object", get_error_string(err));
- continue;
- }
-
- // cannot iterate over symbols until executable is frozen
- }
- module_load_success = true;
- } while (0);
- DP("Modules loaded successful? %d\n", module_load_success);
- if (module_load_success) {
- /* Freeze the executable; it can now be queried for symbols. */
- err = hsa_executable_freeze(executable, "");
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Freeze the executable", get_error_string(err));
- return HSA_STATUS_ERROR;
- }
-
- err = hsa::executable_iterate_symbols(
- executable,
- [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
- return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable);
- });
- if (err != HSA_STATUS_SUCCESS) {
- printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Iterating over symbols for execuatable", get_error_string(err));
- return HSA_STATUS_ERROR;
- }
-
- // save the executable and destroy during finalize
- HSAExecutables.push_back(executable);
- return HSA_STATUS_SUCCESS;
- } else {
- return HSA_STATUS_ERROR;
- }
-}
-
-} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h
deleted file mode 100644
index 106884e4f432b6..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h
+++ /dev/null
@@ -1,20 +0,0 @@
-//===--- amdgpu/src/print_tracing.h ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
-#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
-
-enum PrintTraceControlBits {
- LAUNCH = 1, // print a message to stderr for each kernel launch
- RTL_TIMING = 2, // Print timing info around each RTL step
- STARTUP_DETAILS = 4, // Details around loading up kernel
- RTL_TO_STDOUT = 8 // Redirect RTL tracing to stdout
-};
-
-extern int print_kernel_trace; // set by environment variable
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
deleted file mode 100644
index d25a520326b9a0..00000000000000
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ /dev/null
@@ -1,2615 +0,0 @@
-//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for AMD hsa machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include <algorithm>
-#include <assert.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <functional>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <unordered_map>
-#include <vector>
-
-#include "ELFSymbols.h"
-#include "impl_runtime.h"
-#include "interop_hsa.h"
-
-#include "UtilitiesRTL.h"
-#include "internal.h"
-#include "rt.h"
-
-#include "DeviceEnvironment.h"
-#include "get_elf_mach_gfx_name.h"
-#include "omptargetplugin.h"
-#include "print_tracing.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-using namespace llvm::omp::target::plugin::utils;
-
-// hostrpc interface, FIXME: consider moving to its own include these are
-// statically linked into amdgpu/plugin if present from hostrpc_services.a,
-// linked as --whole-archive to override the weak symbols that are used to
-// implement a fallback for toolchains that do not yet have a hostrpc library.
-extern "C" {
-uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
- uint32_t DeviceId);
-hsa_status_t hostrpc_init();
-hsa_status_t hostrpc_terminate();
-
-__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
-__attribute__((weak)) hsa_status_t hostrpc_terminate() {
- return HSA_STATUS_SUCCESS;
-}
-__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
- uint32_t DeviceId) {
- DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
- "missing\n",
- DeviceId);
- return 0;
-}
-}
-
-// Heuristic parameters used for kernel launch
-// Number of teams per CU to allow scheduling flexibility
-static const unsigned DefaultTeamsPerCU = 4;
-
-int print_kernel_trace;
-
-#ifdef OMPTARGET_DEBUG
-#define check(msg, status) \
- if (status != HSA_STATUS_SUCCESS) { \
- DP(#msg " failed\n"); \
- } else { \
- DP(#msg " succeeded\n"); \
- }
-#else
-#define check(msg, status) \
- {}
-#endif
-
-#include "elf_common.h"
-
-namespace hsa {
-template <typename C> hsa_status_t iterate_agents(C Cb) {
- auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
- C *Unwrapped = static_cast<C *>(Data);
- return (*Unwrapped)(Agent);
- };
- return hsa_iterate_agents(L, static_cast<void *>(&Cb));
-}
-
-template <typename C>
-hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
- auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
- C *Unwrapped = static_cast<C *>(Data);
- return (*Unwrapped)(MemoryPool);
- };
-
- return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
-}
-
-} // namespace hsa
-
-/// Keep entries table per device
-struct FuncOrGblEntryTy {
- __tgt_target_table Table;
- std::vector<__tgt_offload_entry> Entries;
-};
-
-struct KernelArgPool {
-private:
- static pthread_mutex_t Mutex;
-
-public:
- uint32_t KernargSegmentSize;
- void *KernargRegion = nullptr;
- std::queue<int> FreeKernargSegments;
-
- uint32_t kernargSizeIncludingImplicit() {
- return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy);
- }
-
- ~KernelArgPool() {
- if (KernargRegion) {
- auto R = hsa_amd_memory_pool_free(KernargRegion);
- if (R != HSA_STATUS_SUCCESS) {
- DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
- }
- }
- }
-
- // Can't really copy or move a mutex
- KernelArgPool() = default;
- KernelArgPool(const KernelArgPool &) = delete;
- KernelArgPool(KernelArgPool &&) = delete;
-
- KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
- : KernargSegmentSize(KernargSegmentSize) {
-
- // impl uses one pool per kernel for all gpus, with a fixed upper size
- // preserving that exact scheme here, including the queue<int>
-
- hsa_status_t Err = hsa_amd_memory_pool_allocate(
- MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
- &KernargRegion);
-
- if (Err != HSA_STATUS_SUCCESS) {
- DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
- KernargRegion = nullptr; // paranoid
- return;
- }
-
- Err = core::allow_access_to_all_gpu_agents(KernargRegion);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
- get_error_string(Err));
- auto R = hsa_amd_memory_pool_free(KernargRegion);
- if (R != HSA_STATUS_SUCCESS) {
- // if free failed, can't do anything more to resolve it
- DP("hsa memory poll free failed: %s\n", get_error_string(Err));
- }
- KernargRegion = nullptr;
- return;
- }
-
- for (int I = 0; I < MAX_NUM_KERNELS; I++) {
- FreeKernargSegments.push(I);
- }
- }
-
- void *allocate(uint64_t ArgNum) {
- assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
- Lock L(&Mutex);
- void *Res = nullptr;
- if (!FreeKernargSegments.empty()) {
-
- int FreeIdx = FreeKernargSegments.front();
- Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
- (FreeIdx * kernargSizeIncludingImplicit()));
- assert(FreeIdx == pointerToIndex(Res));
- FreeKernargSegments.pop();
- }
- return Res;
- }
-
- void deallocate(void *Ptr) {
- Lock L(&Mutex);
- int Idx = pointerToIndex(Ptr);
- FreeKernargSegments.push(Idx);
- }
-
-private:
- int pointerToIndex(void *Ptr) {
- ptrdiff_t Bytes =
- static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
- assert(Bytes >= 0);
- assert(Bytes % kernargSizeIncludingImplicit() == 0);
- return Bytes / kernargSizeIncludingImplicit();
- }
- struct Lock {
- Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
- ~Lock() { pthread_mutex_unlock(M); }
- pthread_mutex_t *M;
- };
-};
-pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
-
-std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
- KernelArgPoolMap;
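Because every slot in the region has the same size, a pointer maps back to its slot index by plain division. A minimal, self-contained sketch of that scheme (sizes invented, locking omitted):

  #include <queue>

  struct SlotPool {
    static const int SlotSize = 256; // hypothetical fixed slot size
    char Region[4 * SlotSize];
    std::queue<int> Free;
    SlotPool() {
      for (int i = 0; i < 4; i++)
        Free.push(i);
    }
    void *allocate() {
      if (Free.empty())
        return nullptr;
      int i = Free.front();
      Free.pop();
      return Region + i * SlotSize; // slot i starts at i * SlotSize
    }
    void deallocate(void *p) {
      Free.push((static_cast<char *>(p) - Region) / SlotSize);
    }
  };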
-
-/// Use a single entity to encode a kernel and a set of flags
-struct KernelTy {
- llvm::omp::OMPTgtExecModeFlags ExecutionMode;
- int16_t ConstWGSize;
- int32_t DeviceId;
- void *CallStackAddr = nullptr;
- const char *Name;
-
- KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
- int32_t DeviceId, void *CallStackAddr, const char *Name,
- uint32_t KernargSegmentSize,
- hsa_amd_memory_pool_t &KernArgMemoryPool)
- : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
- DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
- DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
-
- std::string N(Name);
- if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
- KernelArgPoolMap.insert(
- std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
- KernargSegmentSize, KernArgMemoryPool))));
- }
- }
-};
-
-/// List that contains all the kernels.
-/// FIXME: we may need this to be per device and per library.
-std::list<KernelTy> KernelsList;
-
-template <typename Callback> static hsa_status_t findAgents(Callback CB) {
-
- hsa_status_t Err =
- hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
- hsa_device_type_t DeviceType;
- // get_info fails iff HSA runtime not yet initialized
- hsa_status_t Err =
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
-
- if (Err != HSA_STATUS_SUCCESS) {
- if (print_kernel_trace > 0)
- DP("rtl.cpp: err %s\n", get_error_string(Err));
-
- return Err;
- }
-
- CB(DeviceType, Agent);
- return HSA_STATUS_SUCCESS;
- });
-
- // iterate_agents fails iff HSA runtime not yet initialized
- if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
- DP("rtl.cpp: err %s\n", get_error_string(Err));
- }
-
- return Err;
-}
-
-static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
- void *Data) {
- if (Status != HSA_STATUS_SUCCESS) {
- const char *StatusString;
- if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
- StatusString = "unavailable";
- }
- DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
- Status, StatusString);
- abort();
- }
-}
-
-namespace core {
-namespace {
-
-bool checkResult(hsa_status_t Err, const char *ErrMsg) {
- if (Err == HSA_STATUS_SUCCESS)
- return true;
-
- REPORT("%s", ErrMsg);
- REPORT("%s", get_error_string(Err));
- return false;
-}
-
-void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
- __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
-}
-
-uint16_t createHeader() {
- uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
- Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
- Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
- return Header;
-}
-
-hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
- bool AllocAllowed = false;
- hsa_status_t Err = hsa_amd_memory_pool_get_info(
- MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
- &AllocAllowed);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Alloc allowed in memory pool check failed: %s\n",
- get_error_string(Err));
- return Err;
- }
-
- size_t Size = 0;
- Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
- &Size);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Get memory pool size failed: %s\n", get_error_string(Err));
- return Err;
- }
-
- return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
-}
-
-hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
- std::vector<hsa_amd_memory_pool_t> *Result =
- static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
-
- hsa_status_t Err;
- if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
- return Err;
- }
-
- Result->push_back(MemoryPool);
- return HSA_STATUS_SUCCESS;
-}
-
-} // namespace
-} // namespace core
-
-struct EnvironmentVariables {
- int NumTeams;
- int TeamLimit;
- int TeamThreadLimit;
- int MaxTeamsDefault;
- int DynamicMemSize;
-};
-
-template <uint32_t wavesize>
-static constexpr const llvm::omp::GV &getGridValue() {
- return llvm::omp::getAMDGPUGridValues<wavesize>();
-}
-
-struct HSALifetime {
- // Wrapper around HSA used to ensure it is constructed before other types
- // and destructed after, which means said other types can use raii for
- // cleanup without risking running outside of the lifetime of HSA
- const hsa_status_t S;
-
- bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
- HSALifetime() : S(hsa_init()) {}
-
- ~HSALifetime() {
- if (S == HSA_STATUS_SUCCESS) {
- hsa_status_t Err = hsa_shut_down();
- if (Err != HSA_STATUS_SUCCESS) {
- // Can't call into HSA to get a string from the integer
- DP("Shutting down HSA failed: %d\n", Err);
- }
- }
- }
-};
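The same construct-first/destroy-last trick works for any C-style init/shutdown pair; a minimal sketch with invented stand-ins for hsa_init and hsa_shut_down:

  #include <cstdio>

  static int lib_init() { return 0; }                      // hypothetical init
  static void lib_shut_down() { std::puts("shut down"); }  // hypothetical teardown

  struct LibLifetime {
    const int S; // saved init status, checked again before shutdown
    LibLifetime() : S(lib_init()) {}
    ~LibLifetime() {
      if (S == 0)
        lib_shut_down();
    }
  };

  int main() { LibLifetime L; } // prints "shut down" at scope exit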
-
-// Handle scheduling of multiple hsa_queue's per device to
-// multiple threads (one scheduler per device)
-class HSAQueueScheduler {
-public:
- HSAQueueScheduler() : Current(0) {}
-
- HSAQueueScheduler(const HSAQueueScheduler &) = delete;
-
- HSAQueueScheduler(HSAQueueScheduler &&Q) {
- Current = Q.Current.load();
- for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
- HSAQueues[I] = Q.HSAQueues[I];
- Q.HSAQueues[I] = nullptr;
- }
- }
-
- // \return false if any HSA queue creation fails
- bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
- for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
- hsa_queue_t *Q = nullptr;
- hsa_status_t Rc =
- hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
- callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
- if (Rc != HSA_STATUS_SUCCESS) {
- DP("Failed to create HSA queue %d\n", I);
- return false;
- }
- HSAQueues[I] = Q;
- }
- return true;
- }
-
- ~HSAQueueScheduler() {
- for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
- if (HSAQueues[I]) {
- hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
- if (Err != HSA_STATUS_SUCCESS)
- DP("Error destroying HSA queue");
- }
- }
- }
-
- // \return next queue to use for device
- hsa_queue_t *next() {
- return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
- NUM_QUEUES_PER_DEVICE];
- }
-
-private:
- // Number of queues per device
- enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
- hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
- std::atomic<uint8_t> Current;
-};
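A minimal sketch of the same lock-free round robin over a fixed slot count (values invented; the real class hands out hsa_queue_t pointers):

  #include <atomic>
  #include <cstdint>

  struct RoundRobin {
    std::atomic<uint8_t> Current{0};
    int Slots[4] = {0, 1, 2, 3};
    int next() {
      // fetch_add wraps at 256; since 4 divides 256, % 4 stays fair.
      return Slots[Current.fetch_add(1, std::memory_order_relaxed) % 4];
    }
  };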
-
-/// Class containing all the device information
-class RTLDeviceInfoTy : HSALifetime {
- std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
- struct QueueDeleter {
- void operator()(hsa_queue_t *Q) {
- if (Q) {
- hsa_status_t Err = hsa_queue_destroy(Q);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Error destroying hsa queue: %s\n", get_error_string(Err));
- }
- }
- }
- };
-
-public:
- bool ConstructionSucceeded = false;
-
- // load binary populates symbol tables and mutates various global state
- // run uses those symbol tables
- std::shared_timed_mutex LoadRunLock;
-
- int NumberOfDevices = 0;
-
- // GPU devices
- std::vector<hsa_agent_t> HSAAgents;
- std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
-
- // CPUs
- std::vector<hsa_agent_t> CPUAgents;
-
- // Device properties
- std::vector<int> ComputeUnits;
- std::vector<int> GroupsPerDevice;
- std::vector<int> ThreadsPerGroup;
- std::vector<int> WarpSize;
- std::vector<std::string> GPUName;
- std::vector<std::string> TargetID;
-
- // OpenMP properties
- std::vector<int> NumTeams;
- std::vector<int> NumThreads;
-
- // OpenMP Environment properties
- EnvironmentVariables Env;
-
- // OpenMP Requires Flags
- int64_t RequiresFlags;
-
- // Resource pools
- SignalPoolT FreeSignalPool;
-
- bool HostcallRequired = false;
-
- std::vector<hsa_executable_t> HSAExecutables;
-
- std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
- std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
-
- hsa_amd_memory_pool_t KernArgPool;
-
- // fine grained memory pool for host allocations
- hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
-
- // fine and coarse-grained memory pools per offloading device
- std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
- std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
-
- struct ImplFreePtrDeletor {
- void operator()(void *P) {
- core::Runtime::Memfree(P); // ignore failure to free
- }
- };
-
- // device_State shared across loaded binaries, error if inconsistent size
- std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
- DeviceStateStore;
-
- static const unsigned HardTeamLimit =
- (1 << 16) - 1; // 64K needed to fit in uint16
- static const int DefaultNumTeams = 128;
-
- // These need to be per-device since different devices can have different
- // wave sizes, but are currently the same number for each so that refactor
- // can be postponed.
- static_assert(getGridValue<32>().GV_Max_Teams ==
- getGridValue<64>().GV_Max_Teams,
- "");
- static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
-
- static_assert(getGridValue<32>().GV_Max_WG_Size ==
- getGridValue<64>().GV_Max_WG_Size,
- "");
- static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
-
- static_assert(getGridValue<32>().GV_Default_WG_Size ==
- getGridValue<64>().GV_Default_WG_Size,
- "");
- static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
-
- using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
- hsa_agent_t, hsa_amd_memory_pool_t);
- hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
- MemcpyFunc Func, int32_t DeviceId) {
- hsa_agent_t Agent = HSAAgents[DeviceId];
- hsa_signal_t S = FreeSignalPool.pop();
- if (S.handle == 0) {
- return HSA_STATUS_ERROR;
- }
- hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
- FreeSignalPool.push(S);
- return R;
- }
-
- hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
- int32_t DeviceId) {
- return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
- }
-
- hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
- int32_t DeviceId) {
- return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
- }
-
- static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
- char TmpChar[1000];
- uint16_t Major, Minor;
- uint32_t TmpUInt;
- uint32_t TmpUInt2;
- uint32_t CacheSize[4];
- bool TmpBool;
- uint16_t WorkgroupMaxDim[3];
- hsa_dim3_t GridMaxDim;
-
- // Getting basic information about HSA and Device
- core::checkResult(
- hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
- "Error from hsa_system_get_info when obtaining "
- "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
- core::checkResult(
- hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
- "Error from hsa_system_get_info when obtaining "
- "HSA_SYSTEM_INFO_VERSION_MINOR\n");
- printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
- printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
- core::checkResult(
- hsa_agent_get_info(
- Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
- printf(" Product Name: \t\t\t%s \n", TmpChar);
- core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_NAME\n");
- printf(" Device Name: \t\t\t%s \n", TmpChar);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_NAME\n");
- printf(" Vendor Name: \t\t\t%s \n", TmpChar);
- hsa_device_type_t DevType;
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_DEVICE\n");
- printf(" Device Type: \t\t\t%s \n",
- DevType == HSA_DEVICE_TYPE_CPU
- ? "CPU"
- : (DevType == HSA_DEVICE_TYPE_GPU
- ? "GPU"
- : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_QUEUES_MAX\n");
- printf(" Max Queues: \t\t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
- printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
- printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
-
- // Getting cache information
- printf(" Cache:\n");
-
- // FIXME: This is deprecated according to HSA documentation. But using
- // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
- // runtime.
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_CACHE_SIZE\n");
-
- for (int I = 0; I < 4; I++) {
- if (CacheSize[I]) {
- printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
- }
- }
-
- core::checkResult(
- hsa_agent_get_info(Agent,
- (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
- &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
- printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(
- Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
- &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
- printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(
- Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
- &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
- printf(" Compute Units: \t\t\t%u \n", TmpUInt);
- core::checkResult(hsa_agent_get_info(
- Agent,
- (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
- &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
- printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
- printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
- printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
- printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
- core::checkResult(hsa_agent_get_info(Agent,
- HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
- WorkgroupMaxDim),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
- printf(" Workgroup Max Size per Dimension:\n");
- printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
- printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
- printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
- core::checkResult(hsa_agent_get_info(
- Agent,
- (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
- &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
- printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
- printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
- printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_GRID_MAX_DIM\n");
- printf(" Grid Max Size per Dimension: \t\t\n");
- printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
- printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
- printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
- core::checkResult(
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
- "Error returned from hsa_agent_get_info when obtaining "
- "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
- printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
-
- printf(" Memory Pools:\n");
- auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
- std::string TmpStr;
- size_t Size;
- bool Alloc, Access;
- hsa_amd_segment_t Segment;
- hsa_amd_memory_pool_global_flag_t GlobalFlags;
- core::checkResult(
- hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
- "Error returned from hsa_amd_memory_pool_get_info when obtaining "
- "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
- core::checkResult(hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
- "Error returned from hsa_amd_memory_pool_get_info when "
- "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
-
- switch (Segment) {
- case HSA_AMD_SEGMENT_GLOBAL:
- TmpStr = "GLOBAL; FLAGS: ";
- if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
- TmpStr += "KERNARG, ";
- if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
- TmpStr += "FINE GRAINED, ";
- if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
- TmpStr += "COARSE GRAINED, ";
- break;
- case HSA_AMD_SEGMENT_READONLY:
- TmpStr = "READONLY";
- break;
- case HSA_AMD_SEGMENT_PRIVATE:
- TmpStr = "PRIVATE";
- break;
- case HSA_AMD_SEGMENT_GROUP:
- TmpStr = "GROUP";
- break;
- }
- printf(" Pool %s: \n", TmpStr.c_str());
-
- core::checkResult(hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
- "Error returned from hsa_amd_memory_pool_get_info when "
- "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
- printf(" Size: \t\t\t\t %zu bytes\n", Size);
- core::checkResult(
- hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
- "Error returned from hsa_amd_memory_pool_get_info when obtaining "
- "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
- printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
- core::checkResult(
- hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
- "Error returned from hsa_amd_memory_pool_get_info when obtaining "
- "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
- printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
- core::checkResult(
- hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
- "Error returned from hsa_amd_memory_pool_get_info when obtaining "
- "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
- printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
- core::checkResult(
- hsa_amd_memory_pool_get_info(
- Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
- "Error returned from hsa_amd_memory_pool_get_info when obtaining "
- "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
- printf(" Accessable by all: \t\t %s\n",
- (Access ? "TRUE" : "FALSE"));
-
- return HSA_STATUS_SUCCESS;
- };
- // Iterate over all the memory regions for this agent. Get the memory region
- // type and size
- hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
-
- printf(" ISAs:\n");
- auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
- char TmpChar[1000];
- core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
- "Error returned from hsa_isa_get_info_alt when "
- "obtaining HSA_ISA_INFO_NAME\n");
- printf(" Name: \t\t\t\t %s\n", TmpChar);
-
- return HSA_STATUS_SUCCESS;
- };
- // Iterate over all the ISAs for this agent and print their names.
- hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
- }
-
- // Record entry point associated with device
- void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- E.Entries.push_back(Entry);
- }
-
- // Return true if the entry is associated with device
- bool findOffloadEntry(int32_t DeviceId, void *Addr) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- for (auto &It : E.Entries) {
- if (It.addr == Addr)
- return true;
- }
-
- return false;
- }
-
- // Return the pointer to the target entries table
- __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- int32_t Size = E.Entries.size();
-
- // Table is empty
- if (!Size)
- return 0;
-
- __tgt_offload_entry *Begin = &E.Entries[0];
- __tgt_offload_entry *End = &E.Entries[Size - 1];
-
- // Update table info according to the entries and return the pointer
- E.Table.EntriesBegin = Begin;
- E.Table.EntriesEnd = ++End;
-
- return &E.Table;
- }
-
- // Clear entries table for a device
- void clearOffloadEntriesTable(int DeviceId) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncGblEntries[DeviceId].emplace_back();
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
- // KernelArgPoolMap.clear();
- E.Entries.clear();
- E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
- }
-
- hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
- unsigned int DeviceId) {
- assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Invalid device id");
- uint32_t GlobalFlags = 0;
- hsa_status_t Err = hsa_amd_memory_pool_get_info(
- MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
-
- if (Err != HSA_STATUS_SUCCESS) {
- return Err;
- }
-
- if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
- DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
- } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
- DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
- }
-
- return HSA_STATUS_SUCCESS;
- }
-
- hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
- for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
- hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
- Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
- hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
- if (ValidStatus != HSA_STATUS_SUCCESS) {
- DP("Alloc allowed in memory pool check failed: %s\n",
- get_error_string(ValidStatus));
- return HSA_STATUS_SUCCESS;
- }
- return addDeviceMemoryPool(MemoryPool, DeviceId);
- });
-
- if (Err != HSA_STATUS_SUCCESS) {
- DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Iterate all memory pools", get_error_string(Err));
- return Err;
- }
- }
- return HSA_STATUS_SUCCESS;
- }
-
- hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
- std::vector<hsa_amd_memory_pool_t> HostPools;
-
- // collect all the "valid" pools for all the given agents.
- for (const auto &Agent : Agents) {
- hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
- Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
- if (Err != HSA_STATUS_SUCCESS) {
- DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
- }
- }
-
- // We need two fine-grained pools.
- // 1. One with kernarg flag set for storing kernel arguments
- // 2. Second for host allocations
- bool FineGrainedMemoryPoolSet = false;
- bool KernArgPoolSet = false;
- for (const auto &MemoryPool : HostPools) {
- hsa_status_t Err = HSA_STATUS_SUCCESS;
- uint32_t GlobalFlags = 0;
- Err = hsa_amd_memory_pool_get_info(
- MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Get memory pool info failed: %s\n", get_error_string(Err));
- return Err;
- }
-
- if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
- if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
- KernArgPool = MemoryPool;
- KernArgPoolSet = true;
- } else {
- HostFineGrainedMemoryPool = MemoryPool;
- FineGrainedMemoryPoolSet = true;
- }
- }
- }
-
- if (FineGrainedMemoryPoolSet && KernArgPoolSet)
- return HSA_STATUS_SUCCESS;
-
- return HSA_STATUS_ERROR;
- }
-
- hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
- assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
- "Invalid device Id");
- return DeviceCoarseGrainedMemoryPools[DeviceId];
- }
-
- hsa_amd_memory_pool_t getHostMemoryPool() {
- return HostFineGrainedMemoryPool;
- }
-
- static int readEnv(const char *Env, int Default = -1) {
- const char *EnvStr = getenv(Env);
- int Res = Default;
- if (EnvStr) {
- Res = std::stoi(EnvStr);
- DP("Parsed %s=%d\n", Env, Res);
- }
- return Res;
- }
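A quick illustration of the defaulting behaviour, assuming the variables are
unset in the environment:

  int TeamLimit = readEnv("OMP_TEAM_LIMIT");                   // yields -1
  int DynMem = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);  // yields 0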
-
- RTLDeviceInfoTy() {
- DP("Start initializing " GETNAME(TARGET_NAME) "\n");
-
- // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
- // any time; a debug library build is not required.
- // 0 => no tracing
- // 1 => trace dispatch only
- // >1 => increased verbosity
-
- if (!HSAInitSuccess()) {
- DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
- return;
- }
-
- if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
- print_kernel_trace = atoi(EnvStr);
- else
- print_kernel_trace = 0;
-
- hsa_status_t Err = core::atl_init_gpu_context();
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
- return;
- }
-
- // Init hostcall soon after initializing hsa
- hostrpc_init();
-
- Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
- if (DeviceType == HSA_DEVICE_TYPE_CPU) {
- CPUAgents.push_back(Agent);
- } else {
- HSAAgents.push_back(Agent);
- }
- });
- if (Err != HSA_STATUS_SUCCESS)
- return;
-
- NumberOfDevices = (int)HSAAgents.size();
-
- if (NumberOfDevices == 0) {
- DP("There are no devices supporting HSA.\n");
- return;
- }
- DP("There are %d devices supporting HSA.\n", NumberOfDevices);
-
- // Init the device info
- HSAQueueSchedulers.reserve(NumberOfDevices);
- FuncGblEntries.resize(NumberOfDevices);
- ThreadsPerGroup.resize(NumberOfDevices);
- ComputeUnits.resize(NumberOfDevices);
- GPUName.resize(NumberOfDevices);
- GroupsPerDevice.resize(NumberOfDevices);
- WarpSize.resize(NumberOfDevices);
- NumTeams.resize(NumberOfDevices);
- NumThreads.resize(NumberOfDevices);
- DeviceStateStore.resize(NumberOfDevices);
- KernelInfoTable.resize(NumberOfDevices);
- SymbolInfoTable.resize(NumberOfDevices);
- DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
- DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
-
- Err = setupDevicePools(HSAAgents);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Setup for Device Memory Pools failed\n");
- return;
- }
-
- Err = setupHostMemoryPools(CPUAgents);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Setup for Host Memory Pools failed\n");
- return;
- }
-
- for (int I = 0; I < NumberOfDevices; I++) {
- uint32_t QueueSize = 0;
- {
- hsa_status_t Err = hsa_agent_get_info(
- HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
- return;
- }
- enum { MaxQueueSize = 4096 };
- if (QueueSize > MaxQueueSize) {
- QueueSize = MaxQueueSize;
- }
- }
-
- {
- HSAQueueScheduler QSched;
- if (!QSched.createQueues(HSAAgents[I], QueueSize))
- return;
- HSAQueueSchedulers.emplace_back(std::move(QSched));
- }
-
- DeviceStateStore[I] = {nullptr, 0};
- }
-
- for (int I = 0; I < NumberOfDevices; I++) {
- ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
- GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
- ComputeUnits[I] = 1;
- DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
- GroupsPerDevice[I], ThreadsPerGroup[I]);
- }
-
- // Get environment variables regarding teams
- Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
- Env.NumTeams = readEnv("OMP_NUM_TEAMS");
- Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
- Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
- Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
-
- // Default state.
- RequiresFlags = OMP_REQ_UNDEFINED;
-
- ConstructionSucceeded = true;
- }
-
- ~RTLDeviceInfoTy() {
- DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
- if (!HSAInitSuccess()) {
- // Then none of these can have been set up and they can't be torn down
- return;
- }
- // Run destructors on types that use HSA before
- // impl_finalize removes access to it
- DeviceStateStore.clear();
- KernelArgPoolMap.clear();
- // Terminate hostrpc before finalizing hsa
- hostrpc_terminate();
-
- hsa_status_t Err;
- for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
- Err = hsa_executable_destroy(HSAExecutables[I]);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
- "Destroying executable", get_error_string(Err));
- }
- }
- }
-};
-
-pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
-
-// Putting accesses to DeviceInfo global behind a function call prior
-// to changing to use init_plugin/deinit_plugin calls
-static RTLDeviceInfoTy DeviceInfoState;
-static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
-
-namespace {
-
-int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
- __tgt_async_info *AsyncInfo) {
- assert(AsyncInfo && "AsyncInfo is nullptr");
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- // Return success if we are not copying back to host from target.
- if (!HstPtr)
- return OFFLOAD_SUCCESS;
- hsa_status_t Err;
- DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
- (long long unsigned)(Elf64_Addr)TgtPtr,
- (long long unsigned)(Elf64_Addr)HstPtr);
-
- Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
- DeviceId);
-
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Error when copying data from device to host. Pointers: "
- "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
- (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
- return OFFLOAD_FAIL;
- }
- DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
- (long long unsigned)(Elf64_Addr)TgtPtr,
- (long long unsigned)(Elf64_Addr)HstPtr);
- return OFFLOAD_SUCCESS;
-}
-
-int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
- __tgt_async_info *AsyncInfo) {
- assert(AsyncInfo && "AsyncInfo is nullptr");
- hsa_status_t Err;
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- // Return success if we are not doing host to target.
- if (!HstPtr)
- return OFFLOAD_SUCCESS;
-
- DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
- (long long unsigned)(Elf64_Addr)HstPtr,
- (long long unsigned)(Elf64_Addr)TgtPtr);
- Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
- DeviceId);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Error when copying data from host to device. Pointers: "
- "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
- (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
- return OFFLOAD_FAIL;
- }
- return OFFLOAD_SUCCESS;
-}
-
-// Async.
-// The implementation was written with cuda streams in mind. The semantics of
-// that are to execute kernels on a queue in order of insertion. A synchronise
-// call then makes writes visible between host and device. This means a series
-// of N data_submit_async calls is expected to execute serially. HSA offers
-// various options to run the data copies concurrently. This may require changes
-// to libomptarget.
-
-// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
-// there are no outstanding kernels that need to be synchronized. Any async call
-// may be passed a Queue==0, at which point the cuda implementation will set it
-// to non-null (see getStream). The cuda streams are per-device. Upstream may
-// change this interface to explicitly initialize the AsyncInfo pointer, but
-// until then hsa lazily initializes it as well.
-
-void initAsyncInfo(__tgt_async_info *AsyncInfo) {
- // Set non-null while async calls are in flight; reset to null on completion.
- assert(AsyncInfo);
- if (!AsyncInfo->Queue) {
- AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
- }
-}
-void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
- assert(AsyncInfo);
- assert(AsyncInfo->Queue);
- AsyncInfo->Queue = 0;
-}
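A hedged sketch of the intended lifecycle, with the __tgt_async_info object
supplied by libomptarget:

  __tgt_async_info Info = {};  // Queue starts out null
  initAsyncInfo(&Info);        // marks asynchronous work as outstanding
  // ... enqueue data transfers or kernels ...
  finiAsyncInfo(&Info);        // synchronization point: Queue back to null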
-
-// Determine launch values for kernel.
-struct LaunchVals {
- int WorkgroupSize;
- int GridSize;
-};
-LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
- int ConstWGSize,
- llvm::omp::OMPTgtExecModeFlags ExecutionMode,
- int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
- int DeviceNumTeams) {
-
- int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
- int NumGroups = 0;
-
- int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
- if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
- MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
-
- if (print_kernel_trace & STARTUP_DETAILS) {
- DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
- DP("Max_Teams: %d\n", MaxTeams);
- DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
- DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
- DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
- RTLDeviceInfoTy::DefaultWgSize);
- DP("thread_limit: %d\n", ThreadLimit);
- DP("threadsPerGroup: %d\n", ThreadsPerGroup);
- DP("ConstWGSize: %d\n", ConstWGSize);
- }
- // check for thread_limit() clause
- if (ThreadLimit > 0) {
- ThreadsPerGroup = ThreadLimit;
- DP("Setting threads per block to requested %d\n", ThreadLimit);
- // Add master warp for GENERIC
- if (ExecutionMode ==
- llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
- ThreadsPerGroup += WarpSize;
- DP("Adding master wavefront: +%d threads\n", WarpSize);
- }
- if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
- ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
- DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
- }
- }
- // check flat_max_work_group_size attr here
- if (ThreadsPerGroup > ConstWGSize) {
- ThreadsPerGroup = ConstWGSize;
- DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
- ThreadsPerGroup);
- }
- if (print_kernel_trace & STARTUP_DETAILS)
- DP("threadsPerGroup: %d\n", ThreadsPerGroup);
- DP("Preparing %d threads\n", ThreadsPerGroup);
-
- // Set default num_groups (teams)
- if (Env.TeamLimit > 0)
- NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
- else
- NumGroups = MaxTeams;
- DP("Set default num of groups %d\n", NumGroups);
-
- if (print_kernel_trace & STARTUP_DETAILS) {
- DP("num_groups: %d\n", NumGroups);
- DP("num_teams: %d\n", NumTeams);
- }
-
- // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
- // This reduction is typical for the default case (no thread_limit clause)
- // or when the user requests an excessive num_teams value.
- // FIXME: We can't distinguish between a constant or variable thread limit,
- // so we only handle constant thread_limits.
- if (ThreadsPerGroup >
- RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
- // Should we round threadsPerGroup up to nearest WarpSize
- // here?
- NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
-
- // check for num_teams() clause
- if (NumTeams > 0) {
- NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
- }
- if (print_kernel_trace & STARTUP_DETAILS) {
- DP("num_groups: %d\n", NumGroups);
- DP("Env.NumTeams %d\n", Env.NumTeams);
- DP("Env.TeamLimit %d\n", Env.TeamLimit);
- }
-
- if (Env.NumTeams > 0) {
- NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
- DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
- } else if (Env.TeamLimit > 0) {
- NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
- DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit);
- } else {
- if (NumTeams <= 0) {
- if (LoopTripcount > 0) {
- if (ExecutionMode ==
- llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
- // round up to the nearest integer
- NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
- } else if (ExecutionMode ==
- llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
- NumGroups = LoopTripcount;
- } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
- // This is a generic kernel that was transformed to use SPMD-mode
- // execution but uses Generic-mode semantics for scheduling.
- NumGroups = LoopTripcount;
- }
- DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
- "threads per block %d\n",
- NumGroups, LoopTripcount, ThreadsPerGroup);
- }
- } else {
- NumGroups = NumTeams;
- }
- if (NumGroups > MaxTeams) {
- NumGroups = MaxTeams;
- if (print_kernel_trace & STARTUP_DETAILS)
- DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
- }
- if (NumGroups > NumTeams && NumTeams > 0) {
- NumGroups = NumTeams;
- if (print_kernel_trace & STARTUP_DETAILS)
- DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
- NumTeams);
- }
- }
-
- // num_teams clause always honored, no matter what, unless DEFAULT is active.
- if (NumTeams > 0) {
- NumGroups = NumTeams;
- // Cap num_groups to EnvMaxTeamsDefault if set.
- if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
- NumGroups = Env.MaxTeamsDefault;
- }
- if (print_kernel_trace & STARTUP_DETAILS) {
- DP("threadsPerGroup: %d\n", ThreadsPerGroup);
- DP("num_groups: %d\n", NumGroups);
- DP("loop_tripcount: %ld\n", LoopTripcount);
- }
- DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
- ThreadsPerGroup);
-
- LaunchVals Res;
- Res.WorkgroupSize = ThreadsPerGroup;
- Res.GridSize = ThreadsPerGroup * NumGroups;
- return Res;
-}
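To make the selection logic concrete, a worked example with hypothetical
inputs (all environment limits unset; the field names match the
EnvironmentVariables usage elsewhere in this file):

  EnvironmentVariables Env;
  Env.TeamLimit = Env.NumTeams = Env.MaxTeamsDefault = Env.TeamThreadLimit = -1;
  Env.DynamicMemSize = 0;
  LaunchVals LV = getLaunchVals(
      /*WarpSize=*/64, Env, /*ConstWGSize=*/1024,
      llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD,
      /*NumTeams=*/0, /*ThreadLimit=*/0, /*LoopTripcount=*/10000,
      /*DeviceNumTeams=*/128);
  // ThreadsPerGroup stays at DefaultWgSize = 256; the SPMD branch rounds the
  // trip count up: NumGroups = (10000 - 1) / 256 + 1 = 40. Hence
  // LV.WorkgroupSize == 256 and LV.GridSize == 40 * 256 == 10240.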
-
-static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
- uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
- bool Full = true;
- while (Full) {
- Full =
- PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
- }
- return PacketId;
-}
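The write index grows monotonically while the queue itself is a power-of-two
ring buffer, so the spin above simply blocks while the queue is full. A small
numeric illustration (queue size 4096):

  // A packet id W becomes usable once W < ReadIndex + 4096; its slot in the
  // ring is W & (4096 - 1). For example, PacketId 4097 maps to slot 1 and is
  // dispatchable only after the packet processor has consumed the previous
  // occupant of slot 1.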
-
-int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
- ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
- int32_t ThreadLimit, uint64_t LoopTripcount) {
- // Update the thread limit in gpu memory if it is uninitialized or was
- // specified from the host.
-
- DP("Run target team region thread_limit %d\n", ThreadLimit);
-
- // All args are references.
- std::vector<void *> Args(ArgNum);
- std::vector<void *> Ptrs(ArgNum);
-
- DP("Arg_num: %d\n", ArgNum);
- for (int32_t I = 0; I < ArgNum; ++I) {
- Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
- Args[I] = &Ptrs[I];
- DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
- }
-
- KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
-
- std::string KernelName = std::string(KernelInfo->Name);
- auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
- if (KernelInfoTable[DeviceId].find(KernelName) ==
- KernelInfoTable[DeviceId].end()) {
- DP("Kernel %s not found\n", KernelName.c_str());
- return OFFLOAD_FAIL;
- }
-
- const atl_kernel_info_t KernelInfoEntry =
- KernelInfoTable[DeviceId][KernelName];
- const uint32_t GroupSegmentSize =
- KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
- const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
- const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
- const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
- const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
-
- assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
-
- /*
- * Set limit based on ThreadsPerGroup and GroupsPerDevice
- */
- LaunchVals LV =
- getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
- KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
- NumTeams, // From run_region arg
- ThreadLimit, // From run_region arg
- LoopTripcount, // From run_region arg
- DeviceInfo().NumTeams[KernelInfo->DeviceId]);
- const int GridSize = LV.GridSize;
- const int WorkgroupSize = LV.WorkgroupSize;
-
- if (print_kernel_trace >= LAUNCH) {
- int NumGroups = GridSize / WorkgroupSize;
- // enum modes are SPMD, GENERIC, NONE 0,1,2
- // if doing rtl timing, print to stderr, unless stdout requested.
- bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
- fprintf(TraceToStdout ? stdout : stderr,
- "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
- "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
- "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
- DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
- ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
- GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
- VgprSpillCount, LoopTripcount, KernelInfo->Name);
- }
-
- // Run on the device.
- {
- hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
- if (!Queue) {
- return OFFLOAD_FAIL;
- }
- uint64_t PacketId = acquireAvailablePacketId(Queue);
-
- const uint32_t Mask = Queue->size - 1; // size is a power of 2
- hsa_kernel_dispatch_packet_t *Packet =
- (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
-
- // packet->header is written last
- Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- Packet->workgroup_size_x = WorkgroupSize;
- Packet->workgroup_size_y = 1;
- Packet->workgroup_size_z = 1;
- Packet->reserved0 = 0;
- Packet->grid_size_x = GridSize;
- Packet->grid_size_y = 1;
- Packet->grid_size_z = 1;
- Packet->private_segment_size = KernelInfoEntry.private_segment_size;
- Packet->group_segment_size = GroupSegmentSize;
- Packet->kernel_object = KernelInfoEntry.kernel_object;
- Packet->kernarg_address = 0; // use the block allocator
- Packet->reserved2 = 0; // impl writes id_ here
- Packet->completion_signal = {0}; // may want a pool of signals
-
- KernelArgPool *ArgPool = nullptr;
- void *KernArg = nullptr;
- {
- auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
- if (It != KernelArgPoolMap.end()) {
- ArgPool = (It->second).get();
- }
- }
- if (!ArgPool) {
- DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
- DeviceId);
- }
- {
- if (ArgPool) {
- assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
- KernArg = ArgPool->allocate(ArgNum);
- }
- if (!KernArg) {
- DP("Allocate kernarg failed\n");
- return OFFLOAD_FAIL;
- }
-
- // Copy explicit arguments
- for (int I = 0; I < ArgNum; I++) {
- memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
- }
-
- // Initialize implicit arguments. TODO: Which of these can be dropped
- AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>(
- static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
- memset(ImplArgs, 0,
- sizeof(AMDGPUImplicitArgsTy)); // may not be necessary
- ImplArgs->OffsetX = 0;
- ImplArgs->OffsetY = 0;
- ImplArgs->OffsetZ = 0;
-
- // assign a hostcall buffer for the selected Q
- if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
- // hostrpc_assign_buffer is not thread safe, and this function is
- // under a multiple reader lock, not a writer lock.
- static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
- pthread_mutex_lock(&HostcallInitLock);
- uint64_t Buffer = hostrpc_assign_buffer(
- DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
- pthread_mutex_unlock(&HostcallInitLock);
- if (!Buffer) {
- DP("hostrpc_assign_buffer failed, gpu would dereference null and "
- "error\n");
- return OFFLOAD_FAIL;
- }
-
- DP("Implicit argument count: %d\n",
- KernelInfoEntry.implicit_argument_count);
- if (KernelInfoEntry.implicit_argument_count >= 4) {
- // Initialise pointer for implicit_argument_count != 0 ABI
- // Guess that the right implicit argument is at offset 24 after
- // the explicit arguments. In the future, should be able to read
- // the offset from msgpack. Clang is not annotating it at present.
- uint64_t Offset =
- sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
- if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
- DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
- "args: %d\n",
- Offset + 8, ArgPool->kernargSizeIncludingImplicit());
- } else {
- memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
- }
- }
-
- // initialise pointer for implicit_argument_count == 0 ABI
- ImplArgs->HostcallPtr = Buffer;
- }
-
- Packet->kernarg_address = KernArg;
- }
-
- hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
- if (S.handle == 0) {
- DP("Failed to get signal instance\n");
- return OFFLOAD_FAIL;
- }
- Packet->completion_signal = S;
- hsa_signal_store_relaxed(Packet->completion_signal, 1);
-
- // Publish the packet indicating it is ready to be processed
- core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
- core::createHeader(), Packet->setup);
-
- // Since the packet is already published, its contents must not be
- // accessed any more
- hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
-
- while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
- HSA_WAIT_STATE_BLOCKED) != 0)
- ;
-
- assert(ArgPool);
- ArgPool->deallocate(KernArg);
- DeviceInfo().FreeSignalPool.push(S);
- }
-
- DP("Kernel completed\n");
- return OFFLOAD_SUCCESS;
-}
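The dispatch protocol above, reduced to its essential ordering (a sketch under
the same names used in runRegionLocked, not a drop-in replacement; error
handling elided):

  // 1. Reserve a packet slot and compute its ring-buffer address.
  uint64_t Id = acquireAvailablePacketId(Queue);
  hsa_kernel_dispatch_packet_t *P =
      (hsa_kernel_dispatch_packet_t *)Queue->base_address +
      (Id & (Queue->size - 1));
  // 2. Fill every field except the header; attach a completion signal set
  //    to 1, which drops to 0 when the kernel retires.
  P->completion_signal = Signal;
  hsa_signal_store_relaxed(Signal, 1);
  // 3. Publish: the header is written last, with release semantics, so the
  //    packet processor never sees a half-written packet; then ring the
  //    doorbell.
  core::packetStoreRelease((uint32_t *)P, core::createHeader(), P->setup);
  hsa_signal_store_relaxed(Queue->doorbell_signal, Id);
  // 4. Block until the signal reaches 0, i.e. the kernel has completed.
  while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
    ;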
-
-bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
- const uint16_t AmdgcnMachineID = EM_AMDGPU;
- const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
- if (!R) {
- DP("Supported machine ID not found\n");
- }
- return R;
-}
-
-uint32_t elfEFlags(__tgt_device_image *Image) {
- const char *ImgBegin = (char *)Image->ImageStart;
- size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
-
- StringRef Buffer = StringRef(ImgBegin, ImgSize);
- auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
- /*InitContent=*/false);
- if (!ElfOrErr) {
- consumeError(ElfOrErr.takeError());
- return 0;
- }
-
- if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
- return ELFObj->getPlatformFlags();
- return 0;
-}
-
-template <typename T> bool enforceUpperBound(T *Value, T Upper) {
- bool Changed = *Value > Upper;
- if (Changed) {
- *Value = Upper;
- }
- return Changed;
-}
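Typical use, with MaxWgSize as the cap:

  int Threads = 2048;
  if (enforceUpperBound(&Threads, RTLDeviceInfoTy::MaxWgSize))
    DP("Capped threads to %d\n", Threads);  // Threads is now MaxWgSize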
-
-struct SymbolInfo {
- const void *Addr = nullptr;
- uint32_t Size = UINT32_MAX;
- uint32_t ShType = SHT_NULL;
-};
-
-int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
- StringRef SymName, SymbolInfo *Res) {
- auto SymOrErr = getELFSymbol(ELFObj, SymName);
- if (!SymOrErr) {
- std::string ErrorString = toString(SymOrErr.takeError());
- DP("Failed ELF lookup: %s\n", ErrorString.c_str());
- return 1;
- }
- if (!*SymOrErr)
- return 1;
-
- auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
- if (!SymSecOrErr) {
- std::string ErrorString = toString(SymOrErr.takeError());
- DP("Failed ELF lookup: %s\n", ErrorString.c_str());
- return 1;
- }
-
- Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
- Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
- Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
- return 0;
-}
-
-int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
- SymbolInfo *Res) {
- StringRef Buffer = StringRef(Base, ImgSize);
- auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
- /*InitContent=*/false);
- if (!ElfOrErr) {
- REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
- return 1;
- }
-
- if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
- return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
- return 1;
-}
-
-hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
- const char *SymName, const void **VarAddr,
- uint32_t *VarSize) {
- SymbolInfo SI;
- int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
- if (Rc == 0) {
- *VarAddr = SI.Addr;
- *VarSize = SI.Size;
- return HSA_STATUS_SUCCESS;
- }
- return HSA_STATUS_ERROR;
-}
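A hypothetical lookup of a device global in a not-yet-loaded image (the symbol
name is illustrative):

  const void *Addr;
  uint32_t Size;
  hsa_status_t Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize,
                                          "my_device_global", &Addr, &Size);
  if (Err == HSA_STATUS_SUCCESS)
    DP("Found symbol at " DPxMOD ", %u bytes\n", DPxPTR(Addr), Size);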
-
-template <typename C>
-hsa_status_t moduleRegisterFromMemoryToPlace(
- std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
- std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
- void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
- std::vector<hsa_executable_t> &HSAExecutables) {
- auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
- C *Unwrapped = static_cast<C *>(CbState);
- return (*Unwrapped)(Data, Size);
- };
- return core::RegisterModuleFromMemory(
- KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
- DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
- HSAExecutables);
-}
-
-uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
- uint64_t DeviceStateBytes = 0;
- {
- // If this is the deviceRTL, get the state variable size
- SymbolInfo SizeSi;
- int Rc = getSymbolInfoWithoutLoading(
- ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
-
- if (Rc == 0) {
- if (SizeSi.Size != sizeof(uint64_t)) {
- DP("Found device_State_size variable with wrong size\n");
- return 0;
- }
-
- // Read number of bytes directly from the elf
- memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
- }
- }
- return DeviceStateBytes;
-}
-
-struct DeviceEnvironment {
- // initialise a DeviceEnvironmentTy in the deviceRTL
- // patches around differences in the deviceRTL between trunk, aomp,
- // rocmcc. Over time these differences will tend to zero and this class
- // simplified.
- // Symbol may be in .data or .bss, and may be missing fields, todo:
- // review aomp/trunk/rocm and simplify the following
-
- // The symbol may also have been deadstripped because the device side
- // accessors were unused.
-
- // If the symbol is in .data (aomp, rocm) it can be written directly.
- // If it is in .bss, we must wait for it to be allocated space on the
- // gpu (trunk) and initialize after loading.
- const char *sym() { return "__omp_rtl_device_environment"; }
-
- DeviceEnvironmentTy HostDeviceEnv;
- SymbolInfo SI;
- bool Valid = false;
-
- __tgt_device_image *Image;
- const size_t ImgSize;
-
- DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
- __tgt_device_image *Image, const size_t ImgSize)
- : Image(Image), ImgSize(ImgSize) {
-
- HostDeviceEnv.NumDevices = NumberDevices;
- HostDeviceEnv.DeviceNum = DeviceId;
- HostDeviceEnv.DebugKind = 0;
- HostDeviceEnv.DynamicMemSize = DynamicMemSize;
- if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
- HostDeviceEnv.DebugKind = std::stoi(EnvStr);
-
- int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
- sym(), &SI);
- if (Rc != 0) {
- DP("Finding global device environment '%s' - symbol missing.\n", sym());
- return;
- }
-
- if (SI.Size > sizeof(HostDeviceEnv)) {
- DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
- sizeof(HostDeviceEnv));
- return;
- }
-
- Valid = true;
- }
-
- bool inImage() { return SI.ShType != SHT_NOBITS; }
-
- hsa_status_t beforeLoading(void *Data, size_t Size) {
- if (Valid) {
- if (inImage()) {
- DP("Setting global device environment before load (%u bytes)\n",
- SI.Size);
- uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) -
- reinterpret_cast<const char *>(Image->ImageStart);
- void *Pos = reinterpret_cast<char *>(Data) + Offset;
- memcpy(Pos, &HostDeviceEnv, SI.Size);
- }
- }
- return HSA_STATUS_SUCCESS;
- }
-
- hsa_status_t afterLoading() {
- if (Valid) {
- if (!inImage()) {
- DP("Setting global device environment after load (%u bytes)\n",
- SI.Size);
- int DeviceId = HostDeviceEnv.DeviceNum;
- auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
- void *StatePtr;
- uint32_t StatePtrSize;
- hsa_status_t Err = interop_hsa_get_symbol_info(
- SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("failed to find %s in loaded image\n", sym());
- return Err;
- }
-
- if (StatePtrSize != SI.Size) {
- DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
- SI.Size);
- return HSA_STATUS_ERROR;
- }
-
- return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
- StatePtrSize, DeviceId);
- }
- }
- return HSA_STATUS_SUCCESS;
- }
-};
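In outline, the intended use (it mirrors the module-registration callback
further down; names as in this file):

  DeviceEnvironment Env(DeviceId, NumDevices, DynamicMemSize, Image, ImgSize);
  // Env.beforeLoading(Data, Size) patches the .data copy inside the image
  // bytes before they are handed to HSA; Env.afterLoading() writes the .bss
  // copy on the gpu once the loader has allocated space for it.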
-
-hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
- uint64_t Rounded = 4 * ((Size + 3) / 4);
- void *Ptr;
- hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
- hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
- if (Err != HSA_STATUS_SUCCESS) {
- return Err;
- }
-
- hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
- if (Rc != HSA_STATUS_SUCCESS) {
- DP("zero fill device_state failed with %u\n", Rc);
- core::Runtime::Memfree(Ptr);
- return HSA_STATUS_ERROR;
- }
-
- *RetPtr = Ptr;
- return HSA_STATUS_SUCCESS;
-}
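The rounding keeps the allocation a whole number of 32-bit words, which is
what hsa_amd_memory_fill operates on (it writes uint32_t values):

  // Size = 10 -> Rounded = 4 * ((10 + 3) / 4) = 12 bytes, zeroed as
  // 12 / 4 = 3 words; Size = 12 stays 12. Rounding never shrinks a request.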
-
-bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
- SymbolInfo SI;
- int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
- return (Rc == 0) && (SI.Addr != nullptr);
-}
-
-hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent,
- void **LockedHostPtr) {
- hsa_status_t err = is_locked(HostPtr, LockedHostPtr);
- if (err != HSA_STATUS_SUCCESS)
- return err;
-
- // HostPtr is already locked, just return it
- if (*LockedHostPtr)
- return HSA_STATUS_SUCCESS;
-
- hsa_agent_t Agents[1] = {Agent};
- return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1,
- LockedHostPtr);
-}
-
-hsa_status_t unlock_memory(void *HostPtr) {
- void *LockedHostPtr = nullptr;
- hsa_status_t err = is_locked(HostPtr, &LockedHostPtr);
- if (err != HSA_STATUS_SUCCESS)
- return err;
-
- // if LockedHostPtr is nullptr, then HostPtr was not locked
- if (!LockedHostPtr)
- return HSA_STATUS_SUCCESS;
-
- err = hsa_amd_memory_unlock(HostPtr);
- return err;
-}
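A hedged usage sketch of the lock/unlock pair (HostPtr, Size, and Agent are
assumed to come from the caller):

  void *Locked = nullptr;
  if (lock_memory(HostPtr, Size, Agent, &Locked) == HSA_STATUS_SUCCESS) {
    // ... hand Locked to an asynchronous copy visible to Agent ...
    unlock_memory(HostPtr);  // checks is_locked first; no-op when not locked
  }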
-
-} // namespace
-
-namespace core {
-hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
- return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
- &DeviceInfo().HSAAgents[0], NULL, Ptr);
-}
-} // namespace core
-
-static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
- hsa_status_t err;
- uint32_t name_len;
- err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error getting ISA info length\n");
- return err;
- }
-
- char TargetID[name_len];
- err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error getting ISA info name\n");
- return err;
- }
-
- auto TripleTargetID = llvm::StringRef(TargetID);
- if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
- DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
- }
- return HSA_STATUS_SUCCESS;
-}
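For illustration, how a ROCm-reported ISA name is reduced to a target ID:

  // e.g. "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-"
  //   consume_front("amdgcn-amd-amdhsa")  -> "--gfx90a:sramecc+:xnack-"
  //   ltrim('-')                          -> "gfx90a:sramecc+:xnack-"
  // and "gfx90a:sramecc+:xnack-" is pushed onto DeviceInfo().TargetID.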
-
-extern "C" {
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
- return elfMachineIdIsAmdgcn(Image);
-}
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
- __tgt_image_info *info) {
- if (!__tgt_rtl_is_valid_binary(image))
- return false;
-
- // A subarchitecture was not specified. Assume it is compatible.
- if (!info->Arch)
- return true;
-
- int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
-
- for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
- __tgt_rtl_init_device(DeviceId);
- hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
- hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error iterating ISAs\n");
- return false;
- }
- if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId]))
- return false;
- }
- DP("Image has Target ID compatible with the current environment: %s\n",
- info->Arch);
- return true;
-}
-
-int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
-int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
-
-int __tgt_rtl_number_of_devices() {
- // If the construction failed, no methods are safe to call
- if (DeviceInfo().ConstructionSucceeded) {
- return DeviceInfo().NumberOfDevices;
- }
- DP("AMDGPU plugin construction failed. Zero devices available\n");
- return 0;
-}
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
- DP("Init requires flags to %ld\n", RequiresFlags);
- DeviceInfo().RequiresFlags = RequiresFlags;
- return RequiresFlags;
-}
-
-int32_t __tgt_rtl_init_device(int DeviceId) {
- hsa_status_t Err = hsa_init();
- if (Err != HSA_STATUS_SUCCESS) {
- DP("HSA Initialization Failed.\n");
- return HSA_STATUS_ERROR;
- }
- // this is per device id init
- DP("Initialize the device id: %d\n", DeviceId);
-
- hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
-
- // Get number of Compute Unit
- uint32_t ComputeUnits = 0;
- Err = hsa_agent_get_info(
- Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
- &ComputeUnits);
- if (Err != HSA_STATUS_SUCCESS) {
- DeviceInfo().ComputeUnits[DeviceId] = 1;
- DP("Error getting compute units : settiing to 1\n");
- } else {
- DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
- DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
- }
-
- char GetInfoName[64]; // 64 max size returned by get info
- Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
- (void *)GetInfoName);
- if (Err)
- DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
- else {
- DeviceInfo().GPUName[DeviceId] = GetInfoName;
- }
-
- if (print_kernel_trace & STARTUP_DETAILS)
- DP("Device#%-2d CU's: %2d %s\n", DeviceId,
- DeviceInfo().ComputeUnits[DeviceId],
- DeviceInfo().GPUName[DeviceId].c_str());
-
- // Query attributes to determine number of threads/block and blocks/grid.
- uint16_t WorkgroupMaxDim[3];
- Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
- &WorkgroupMaxDim);
- if (Err != HSA_STATUS_SUCCESS) {
- DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
- DP("Error getting grid dims: num groups : %d\n",
- RTLDeviceInfoTy::DefaultNumTeams);
- } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
- DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
- DP("Using %d ROCm blocks per grid\n",
- DeviceInfo().GroupsPerDevice[DeviceId]);
- } else {
- DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
- DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
- "at the hard limit\n",
- WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
- }
-
- // Get thread limit
- hsa_dim3_t GridMaxDim;
- Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
- if (Err == HSA_STATUS_SUCCESS) {
- DeviceInfo().ThreadsPerGroup[DeviceId] =
- reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
- DeviceInfo().GroupsPerDevice[DeviceId];
-
- if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
- DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
- DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
- } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
- RTLDeviceInfoTy::MaxWgSize)) {
- DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
- } else {
- DP("Using ROCm Queried thread limit: %d\n",
- DeviceInfo().ThreadsPerGroup[DeviceId]);
- }
- } else {
- DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
- DP("Error getting max block dimension, use default:%d \n",
- RTLDeviceInfoTy::MaxWgSize);
- }
-
- // Get wavefront size
- uint32_t WavefrontSize = 0;
- Err =
- hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
- if (Err == HSA_STATUS_SUCCESS) {
- DP("Queried wavefront size: %d\n", WavefrontSize);
- DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
- } else {
- // TODO: Burn the wavefront size into the code object
- DP("Warning: Unknown wavefront size, assuming 64\n");
- DeviceInfo().WarpSize[DeviceId] = 64;
- }
-
- // Adjust teams to the env variables
-
- if (DeviceInfo().Env.TeamLimit > 0 &&
- (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
- DeviceInfo().Env.TeamLimit))) {
- DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
- DeviceInfo().Env.TeamLimit);
- }
-
- // Set default number of teams
- if (DeviceInfo().Env.NumTeams > 0) {
- DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
- DP("Default number of teams set according to environment %d\n",
- DeviceInfo().Env.NumTeams);
- } else {
- char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
- int TeamsPerCU = DefaultTeamsPerCU;
- if (TeamsPerCUEnvStr) {
- TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
- }
-
- DeviceInfo().NumTeams[DeviceId] =
- TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
- DP("Default number of teams = %d * number of compute units %d\n",
- TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
- }
-
- if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
- DeviceInfo().GroupsPerDevice[DeviceId])) {
- DP("Default number of teams exceeds device limit, capping at %d\n",
- DeviceInfo().GroupsPerDevice[DeviceId]);
- }
-
- // Adjust threads to the env variables
- if (DeviceInfo().Env.TeamThreadLimit > 0 &&
- (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
- DeviceInfo().Env.TeamThreadLimit))) {
- DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
- DeviceInfo().Env.TeamThreadLimit);
- }
-
- // Set default number of threads
- DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
- DP("Default number of threads set according to library's default %d\n",
- RTLDeviceInfoTy::DefaultWgSize);
- if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
- DeviceInfo().ThreadsPerGroup[DeviceId])) {
- DP("Default number of threads exceeds device limit, capping at %d\n",
- DeviceInfo().ThreadsPerGroup[DeviceId]);
- }
-
- DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
- DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
- DeviceInfo().ThreadsPerGroup[DeviceId]);
-
- DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
- DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
- DeviceInfo().GroupsPerDevice[DeviceId],
- DeviceInfo().GroupsPerDevice[DeviceId] *
- DeviceInfo().ThreadsPerGroup[DeviceId]);
-
- return OFFLOAD_SUCCESS;
-}
-
-static __tgt_target_table *
-__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
- __tgt_device_image *Image) {
- DeviceInfo().LoadRunLock.lock();
- __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
- DeviceInfo().LoadRunLock.unlock();
- return Res;
-}
-
-__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
- __tgt_device_image *Image) {
- // This function loads the device image onto gpu[DeviceId] and does other
- // per-image initialization work. Specifically:
- //
- // - Initialize a DeviceEnvironmentTy instance embedded in the
- // image at the symbol "__omp_rtl_device_environment"
- // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
- //
- // - Allocate a large array per-gpu (could be moved to init_device)
- // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
- // - Allocate at least that many bytes of gpu memory
- // - Zero initialize it
- // - Write the pointer to the symbol omptarget_nvptx_device_State
- //
- // - Pulls some per-kernel information together from various sources and
- // records it in the KernelsList for quicker access later
- //
- // The initialization can be done before or after loading the image onto the
- // gpu. This function presently does a mixture. Using the hsa api to get/set
- // the information is simpler to implement, in exchange for more complicated
- // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
- // back from the gpu vs a hashtable lookup on the host.
-
- const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
-
- DeviceInfo().clearOffloadEntriesTable(DeviceId);
-
- // We do not need to set the ELF version because the caller of this function
- // had to do that to decide the right runtime to use
-
- if (!elfMachineIdIsAmdgcn(Image))
- return NULL;
-
- {
- auto Env =
- DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
- DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
-
- auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
- auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
- hsa_status_t Err = moduleRegisterFromMemoryToPlace(
- KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
- [&](void *Data, size_t Size) {
- if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
- __atomic_store_n(&DeviceInfo().HostcallRequired, true,
- __ATOMIC_RELEASE);
- }
- return Env.beforeLoading(Data, Size);
- },
- DeviceInfo().HSAExecutables);
-
- check("Module registering", Err);
- if (Err != HSA_STATUS_SUCCESS) {
- const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
- const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
-
- if (strcmp(DeviceName, ElfName) != 0) {
- DP("Possible gpu arch mismatch: device:%s, image:%s please check"
- " compiler flag: -march=<gpu>\n",
- DeviceName, ElfName);
- } else {
- DP("Error loading image onto GPU: %s\n", get_error_string(Err));
- }
-
- return NULL;
- }
-
- Err = Env.afterLoading();
- if (Err != HSA_STATUS_SUCCESS) {
- return NULL;
- }
- }
-
- DP("AMDGPU module successfully loaded!\n");
-
- {
- // the device_State array is either a large value in bss or a void* that
- // needs to be assigned to a pointer to an array of size device_state_bytes.
- // If absent, it has been deadstripped and needs no setup.
-
- void *StatePtr;
- uint32_t StatePtrSize;
- auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
- hsa_status_t Err = interop_hsa_get_symbol_info(
- SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
- &StatePtrSize);
-
- if (Err != HSA_STATUS_SUCCESS) {
- DP("No device_state symbol found, skipping initialization\n");
- } else {
- if (StatePtrSize < sizeof(void *)) {
- DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
- sizeof(void *));
- return NULL;
- }
-
- // if it's larger than a void*, assume it's a bss array and no further
- // initialization is required. Only try to set up a pointer for
- // sizeof(void*)
- if (StatePtrSize == sizeof(void *)) {
- uint64_t DeviceStateBytes =
- getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
- if (DeviceStateBytes == 0) {
- DP("Can't initialize device_State, missing size information\n");
- return NULL;
- }
-
- auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
- if (DSS.first.get() == nullptr) {
- assert(DSS.second == 0);
- void *Ptr = NULL;
- hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Failed to allocate device_state array\n");
- return NULL;
- }
- DSS = {
- std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
- DeviceStateBytes,
- };
- }
-
- void *Ptr = DSS.first.get();
- if (DeviceStateBytes != DSS.second) {
- DP("Inconsistent sizes of device_State unsupported\n");
- return NULL;
- }
-
- // write ptr to device memory so it can be used by later kernels
- Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
- sizeof(void *), DeviceId);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("memcpy install of state_ptr failed\n");
- return NULL;
- }
- }
- }
- }
-
- // Here, we take advantage of the data that is appended after img_end to get
- // the names of the symbols we need to load. This data consists of the host
- // entries begin and end as well as the target name (see the offloading
- // linker script creation in the clang compiler).
-
- // Find the symbols in the module by name. The name can be obtained by
- // concatenating the host entry name with the target name.
-
- __tgt_offload_entry *HostBegin = Image->EntriesBegin;
- __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
- for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-
- if (!E->addr) {
- // The host should always have something in the address to
- // uniquely identify the target region.
- DP("Analyzing host entry '<null>' (size = %lld)...\n",
- (unsigned long long)E->size);
- return NULL;
- }
-
- if (E->size) {
- __tgt_offload_entry Entry = *E;
-
- void *Varptr;
- uint32_t Varsize;
-
- auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
- hsa_status_t Err = interop_hsa_get_symbol_info(
- SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
-
- if (Err != HSA_STATUS_SUCCESS) {
- // Inform the user what symbol prevented offloading
- DP("Loading global '%s' (Failed)\n", E->name);
- return NULL;
- }
-
- if (Varsize != E->size) {
- DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
- Varsize, E->size);
- return NULL;
- }
-
- DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
- DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
- Entry.addr = (void *)Varptr;
-
- DeviceInfo().addOffloadEntry(DeviceId, Entry);
-
- if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
- E->flags & OMP_DECLARE_TARGET_LINK) {
- // If unified memory is present any target link variables
- // can access host addresses directly. There is no longer a
- // need for device copies.
- Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
- sizeof(void *), DeviceId);
- if (Err != HSA_STATUS_SUCCESS)
- DP("Error when copying USM\n");
- DP("Copy linked variable host address (" DPxMOD ")"
- "to device address (" DPxMOD ")\n",
- DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
- }
-
- continue;
- }
-
- DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
-
- // Errors in reading kernarg_segment_size were previously treated as 0 (or as undefined).
- uint32_t KernargSegmentSize = 0;
- auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
- hsa_status_t Err = HSA_STATUS_SUCCESS;
- if (!E->name) {
- Err = HSA_STATUS_ERROR;
- } else {
- std::string KernelStr = std::string(E->name);
- auto It = KernelInfoMap.find(KernelStr);
- if (It != KernelInfoMap.end()) {
- atl_kernel_info_t Info = It->second;
- KernargSegmentSize = Info.kernel_segment_size;
- } else {
- Err = HSA_STATUS_ERROR;
- }
- }
-
- // Default value GENERIC (in case the symbol is missing from the device image).
- llvm::omp::OMPTgtExecModeFlags ExecModeVal =
- llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
-
- // get flat group size if present, else Default_WG_Size
- int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
-
- // Get the kernel descriptor if present.
- // Keep this struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp.
- struct KernDescValType {
- uint16_t Version;
- uint16_t TSize;
- uint16_t WGSize;
- };
- struct KernDescValType KernDescVal;
- std::string KernDescNameStr(E->name);
- KernDescNameStr += "_kern_desc";
- const char *KernDescName = KernDescNameStr.c_str();
-
- const void *KernDescPtr;
- uint32_t KernDescSize;
- void *CallStackAddr = nullptr;
- Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
- &KernDescPtr, &KernDescSize);
-
- if (Err == HSA_STATUS_SUCCESS) {
- if ((size_t)KernDescSize != sizeof(KernDescVal))
- DP("Loading global computation properties '%s' - size mismatch (%u != "
- "%lu)\n",
- KernDescName, KernDescSize, sizeof(KernDescVal));
-
- memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
-
- // Check structure size against recorded size.
- if ((size_t)KernDescSize != KernDescVal.TSize)
- DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
- sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
-
- DP("After loading global for %s KernDesc \n", KernDescName);
- DP("KernDesc: Version: %d\n", KernDescVal.Version);
- DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
- DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
-
- if (KernDescVal.WGSize == 0) {
- KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
- DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
- }
- WGSizeVal = KernDescVal.WGSize;
- DP("WGSizeVal %d\n", WGSizeVal);
- check("Loading KernDesc computation property", Err);
- } else {
- DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
-
- // Flat group size
- std::string WGSizeNameStr(E->name);
- WGSizeNameStr += "_wg_size";
- const char *WGSizeName = WGSizeNameStr.c_str();
-
- const void *WGSizePtr;
- uint32_t WGSize;
- Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
- &WGSizePtr, &WGSize);
-
- if (Err == HSA_STATUS_SUCCESS) {
- if ((size_t)WGSize != sizeof(int16_t)) {
- DP("Loading global computation properties '%s' - size mismatch (%u "
- "!= "
- "%lu)\n",
- WGSizeName, WGSize, sizeof(int16_t));
- return NULL;
- }
-
- memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
-
- DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
-
- if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
- WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
- DP("Error wrong WGSize value specified in HSA code object file: "
- "%d\n",
- WGSizeVal);
- WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
- }
- } else {
- DP("Warning: Loading WGSize '%s' - symbol not found, "
- "using default value %d\n",
- WGSizeName, WGSizeVal);
- }
-
- check("Loading WGSize computation property", Err);
- }
-
- // Read execution mode from global in binary
- std::string ExecModeNameStr(E->name);
- ExecModeNameStr += "_exec_mode";
- const char *ExecModeName = ExecModeNameStr.c_str();
-
- const void *ExecModePtr;
- uint32_t VarSize;
- Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
- &ExecModePtr, &VarSize);
-
- if (Err == HSA_STATUS_SUCCESS) {
- if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
- DP("Loading global computation properties '%s' - size mismatch(%u != "
- "%lu)\n",
- ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
- return NULL;
- }
-
- memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
-
- DP("After loading global for %s ExecMode = %d\n", ExecModeName,
- ExecModeVal);
-
- if (ExecModeVal < 0 ||
- ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
- DP("Error wrong exec_mode value specified in HSA code object file: "
- "%d\n",
- ExecModeVal);
- return NULL;
- }
- } else {
- DP("Loading global exec_mode '%s' - symbol missing, using default "
- "value "
- "GENERIC (1)\n",
- ExecModeName);
- }
- check("Loading computation property", Err);
-
- KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
- CallStackAddr, E->name, KernargSegmentSize,
- DeviceInfo().KernArgPool));
- __tgt_offload_entry Entry = *E;
- Entry.addr = (void *)&KernelsList.back();
- DeviceInfo().addOffloadEntry(DeviceId, Entry);
- DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
- }
-
- return DeviceInfo().getOffloadEntriesTable(DeviceId);
-}
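The load path above derives every auxiliary global from a kernel's entry name by
appending a fixed suffix. A minimal sketch of that naming scheme, where the
kernel name "__omp_offloading_foo" is a hypothetical example:

    #include <string>

    // Sketch only: suffixes as used in the lookups above.
    struct AuxSymbolNames {
      std::string KernDesc; // kernel descriptor (optional)
      std::string WGSize;   // flat workgroup size (optional)
      std::string ExecMode; // execution mode flag (optional)
    };

    AuxSymbolNames auxNamesFor(const std::string &Kernel) {
      return {Kernel + "_kern_desc", Kernel + "_wg_size",
              Kernel + "_exec_mode"};
    }

    // auxNamesFor("__omp_offloading_foo").ExecMode
    //   == "__omp_offloading_foo_exec_mode"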
-
-void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
- void *Ptr = NULL;
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-
- hsa_amd_memory_pool_t MemoryPool;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- // GPU memory
- MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
- break;
- case TARGET_ALLOC_HOST:
- // non-migratable memory accessible by host and device(s)
- MemoryPool = DeviceInfo().getHostMemoryPool();
- break;
- default:
- REPORT("Invalid target data allocation kind or requested allocator not "
- "implemented yet\n");
- return NULL;
- }
-
- hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
- DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
- (long long unsigned)(Elf64_Addr)Ptr);
- Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
- return Ptr;
-}
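For context, a hedged sketch of how an application-level allocation reaches the
Kind switch above: omp_target_alloc arrives as TARGET_ALLOC_DEFAULT, while the
LLVM allocator extensions (assumed available here) select the other pools
explicitly.

    #include <omp.h>

    void allocExample(int Dev) {
      // Routed to the device memory pool (TARGET_ALLOC_DEFAULT).
      void *DevBuf = omp_target_alloc(1024, Dev);
      // llvm_omp_target_alloc_host(1024, Dev) would select TARGET_ALLOC_HOST,
      // i.e. the non-migratable host pool (LLVM extension, assumed available).
      omp_target_free(DevBuf, Dev);
    }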
-
-int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- __tgt_async_info AsyncInfo;
- int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size, __tgt_async_info *AsyncInfo) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- if (AsyncInfo) {
- initAsyncInfo(AsyncInfo);
- return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
- }
- return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- __tgt_async_info AsyncInfo;
- int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size,
- __tgt_async_info *AsyncInfo) {
- assert(AsyncInfo && "AsyncInfo is nullptr");
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- initAsyncInfo(AsyncInfo);
- return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
-}
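The synchronous entry points above are thin wrappers that issue the
asynchronous copy and then synchronize. A hedged sketch of how a caller pairs
the async variants with a final synchronize (Dev, Tgt, Hst, and N are
hypothetical):

    int copyRoundTrip(int Dev, void *Tgt, void *Hst, int64_t N) {
      __tgt_async_info AI{};
      if (__tgt_rtl_data_submit_async(Dev, Tgt, Hst, N, &AI) != OFFLOAD_SUCCESS ||
          __tgt_rtl_data_retrieve_async(Dev, Hst, Tgt, N, &AI) != OFFLOAD_SUCCESS)
        return OFFLOAD_FAIL;
      return __tgt_rtl_synchronize(Dev, &AI); // blocks until both copies finish
    }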
-
-int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- // HSA can free pointers allocated from different types of memory pool.
- hsa_status_t Err;
- DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
- Err = core::Runtime::Memfree(TgtPtr);
- if (Err != HSA_STATUS_SUCCESS) {
- DP("Error when freeing CUDA memory\n");
- return OFFLOAD_FAIL;
- }
- return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- KernelArgsTy *KernelArgs,
- __tgt_async_info *AsyncInfo) {
- assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
- !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
- "Only one dimensional kernels supported.");
- assert(AsyncInfo && "AsyncInfo is nullptr");
- initAsyncInfo(AsyncInfo);
-
- DeviceInfo().LoadRunLock.lock_shared();
- int32_t Res =
- runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
- KernelArgs->NumArgs, KernelArgs->NumTeams[0],
- KernelArgs->ThreadLimit[0], KernelArgs->Tripcount);
-
- DeviceInfo().LoadRunLock.unlock_shared();
- return Res;
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
- // invariant is not ensured by devices.cpp for amdgcn.
- // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
- if (AsyncInfo->Queue) {
- finiAsyncInfo(AsyncInfo);
- }
- return OFFLOAD_SUCCESS;
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
- // TODO: Assertion to see if DeviceId is correct
- // NOTE: We don't need to set context for print device info.
-
- DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
-}
-
-int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size,
- void **LockedHostPtr) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-
- hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
- hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error in tgt_rtl_data_lock\n");
- return OFFLOAD_FAIL;
- }
- DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,
- (long long unsigned)(Elf64_Addr)*LockedHostPtr);
- return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) {
- assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
- hsa_status_t err = unlock_memory(HostPtr);
- if (err != HSA_STATUS_SUCCESS) {
- DP("Error in tgt_rtl_data_unlock\n");
- return OFFLOAD_FAIL;
- }
-
- DP("Tgt unlock data (tgt:%016llx).\n",
- (long long unsigned)(Elf64_Addr)HostPtr);
- return OFFLOAD_SUCCESS;
-}
-
-} // extern "C"
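These __tgt_rtl_* functions form the plugin ABI: libomptarget dlopen()s each
plugin library and resolves the entry points by name. A minimal sketch,
assuming the usual library naming; error handling is elided:

    #include <dlfcn.h>
    #include <cstdint>

    // Signature of __tgt_rtl_data_alloc as declared above.
    using DataAllocFn = void *(*)(int32_t, int64_t, void *, int32_t);

    DataAllocFn resolveDataAlloc() {
      // The library name is illustrative; the real loader probes a list of
      // candidate plugins.
      void *Handle = dlopen("libomptarget.rtl.amdgpu.so", RTLD_NOW);
      if (!Handle)
        return nullptr;
      return reinterpret_cast<DataAllocFn>(
          dlsym(Handle, "__tgt_rtl_data_alloc"));
    }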
diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt
deleted file mode 100644
index ebc4795aa647a0..00000000000000
--- a/openmp/libomptarget/plugins/common/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Common parts which can be used by all plugins
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(elf_common)
-add_subdirectory(MemoryManager)
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
deleted file mode 100644
index c4c460569361cd..00000000000000
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,76 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a CUDA machine if available.
-#
-##===----------------------------------------------------------------------===##
-set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL
- "Whether to build CUDA plugin")
-if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN)
- libomptarget_say("Not building CUDA offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false")
- return()
-endif()
-
-if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
- return()
-endif()
-
-libomptarget_say("Building CUDA offloading plugin.")
-
-set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
-
-add_llvm_library(omptarget.rtl.cuda SHARED
- src/rtl.cpp
-
- LINK_COMPONENTS
- Support
- Object
-
- LINK_LIBS PRIVATE
- elf_common
- MemoryManager
- ${OPENMP_PTHREAD_LIB}
- "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs"
-
- NO_INSTALL_RPATH
-)
-
-if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
- libomptarget_say("Building CUDA plugin linked against libcuda")
- target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
-else()
- libomptarget_say("Building CUDA plugin for dlopened libcuda")
- target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda)
- target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp)
-endif()
-
-# Define the suffix for the runtime messaging dumps.
-target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA")
-target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda PROPERTIES
- INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
- CXX_VISIBILITY_PRESET protected)
-
-# Report to the parent scope that we are building a plugin for CUDA.
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libcuda is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no CUDA available.
-option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS)
- libomptarget_say("Enable tests using CUDA plugin")
- set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE)
- list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda")
- set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
- libomptarget_say("Not generating NVIDIA tests, no supported devices detected. Use 'LIBOMPTARGET_FORCE_NVIDIA_TESTS' to override.")
-endif()
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
deleted file mode 100644
index 871047494c27a4..00000000000000
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ /dev/null
@@ -1,1925 +0,0 @@
-//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for CUDA machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cuda.h>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "DeviceEnvironment.h"
-#include "omptarget.h"
-#include "omptargetplugin.h"
-
-#ifndef TARGET_NAME
-#define TARGET_NAME CUDA
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-#include "MemoryManager.h"
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-
-using namespace llvm;
-
-// Utility for retrieving and printing CUDA error string.
-#ifdef OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err) \
- do { \
- if (getDebugLevel() > 0) { \
- const char *errStr = nullptr; \
- CUresult errStr_status = cuGetErrorString(err, &errStr); \
- if (errStr_status == CUDA_ERROR_INVALID_VALUE) \
- REPORT("Unrecognized CUDA error code: %d\n", err); \
- else if (errStr_status == CUDA_SUCCESS) \
- REPORT("CUDA error is: %s\n", errStr); \
- else { \
- REPORT("Unresolved CUDA error code: %d\n", err); \
- REPORT("Unsuccessful cuGetErrorString return status: %d\n", \
- errStr_status); \
- } \
- } else { \
- const char *errStr = nullptr; \
- CUresult errStr_status = cuGetErrorString(err, &errStr); \
- if (errStr_status == CUDA_SUCCESS) \
- REPORT("%s \n", errStr); \
- } \
- } while (false)
-#else // OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err) \
- do { \
- const char *errStr = nullptr; \
- CUresult errStr_status = cuGetErrorString(err, &errStr); \
- if (errStr_status == CUDA_SUCCESS) \
- REPORT("%s \n", errStr); \
- } while (false)
-#endif // OMPTARGET_DEBUG
-
-#define BOOL2TEXT(b) ((b) ? "Yes" : "No")
-
-#include "elf_common.h"
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
- __tgt_target_table Table;
- std::vector<__tgt_offload_entry> Entries;
-};
-
-/// Use a single entity to encode a kernel and a set of flags.
-struct KernelTy {
- CUfunction Func;
-
- // execution mode of kernel
- llvm::omp::OMPTgtExecModeFlags ExecutionMode;
-
- /// Maximal number of threads per block for this kernel.
- int MaxThreadsPerBlock = 0;
-
- KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
- : Func(Func), ExecutionMode(ExecutionMode) {}
-};
-
-namespace {
-bool checkResult(CUresult Err, const char *ErrMsg) {
- if (Err == CUDA_SUCCESS)
- return true;
-
- REPORT("%s", ErrMsg);
- CUDA_ERR_STRING(Err);
- return false;
-}
-
-int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
- CUstream Stream) {
- CUresult Err =
- cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
-
- if (Err != CUDA_SUCCESS) {
- DP("Error when copying data from device to device. Pointers: src "
- "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
-}
-
-int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
- CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
- CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
- CUresult Err = cuEventRecord(Event, Stream);
- if (Err != CUDA_SUCCESS) {
- DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
- DPxPTR(Stream), DPxPTR(Event));
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
-}
-
-int syncEvent(void *EventPtr) {
- CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
- CUresult Err = cuEventSynchronize(Event);
- if (Err != CUDA_SUCCESS) {
- DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
-}
-
-namespace {
-
-// Structure contains per-device data
-struct DeviceDataTy {
- /// List that contains all the kernels.
- std::list<KernelTy> KernelsList;
-
- std::list<FuncOrGblEntryTy> FuncGblEntries;
-
- CUcontext Context = nullptr;
- // Device properties
- unsigned int ThreadsPerBlock = 0;
- unsigned int BlocksPerGrid = 0;
- unsigned int WarpSize = 0;
- // OpenMP properties
- unsigned int NumTeams = 0;
- unsigned int NumThreads = 0;
-};
-
-/// Resource allocator where \p T is the resource type.
-/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
-/// accordingly. The implementation should not raise any exception.
-template <typename T> struct AllocatorTy {
- using ElementTy = T;
- virtual ~AllocatorTy() {}
-
- /// Create a resource and assign to R.
- virtual int create(T &R) noexcept = 0;
- /// Destroy the resource.
- virtual int destroy(T) noexcept = 0;
-};
-
-/// Allocator for CUstream.
-struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
- /// See AllocatorTy<T>::create.
- int create(CUstream &Stream) noexcept override {
- if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
- "Error returned from cuStreamCreate\n"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-
- /// See AllocatorTy<T>::destroy.
- int destroy(CUstream Stream) noexcept override {
- if (!checkResult(cuStreamDestroy(Stream),
- "Error returned from cuStreamDestroy\n"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-};
-
-/// Allocator for CUevent.
-struct EventAllocatorTy final : public AllocatorTy<CUevent> {
- /// See AllocatorTy<T>::create.
- int create(CUevent &Event) noexcept override {
- if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
- "Error returned from cuEventCreate\n"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-
- /// See AllocatorTy<T>::destroy.
- int destroy(CUevent Event) noexcept override {
- if (!checkResult(cuEventDestroy(Event),
- "Error returned from cuEventDestroy\n"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-};
-
-/// A generic pool of resources where \p AllocTy provides the resource type
-/// \p ElementTy. Resources should be copyable as they are stored in a
-/// \p std::vector.
-template <typename AllocTy> class ResourcePoolTy {
- using ElementTy = typename AllocTy::ElementTy;
- /// Index of the next available resource.
- size_t Next = 0;
- /// Mutex to guard the pool.
- std::mutex Mutex;
- /// Pool of resources. The difference between \p Resources and \p Pool is,
- /// when a resource is acquired and released, it is all on \p Resources. When
- /// a batch of new resources are needed, they are both added to \p Resources
- /// and \p Pool. The reason for this setting is, \p Resources could contain
- /// redundant elements because resources are not released, which can cause
- /// double free. This setting makes sure that \p Pool always has every
- /// resource allocated from the device.
- std::vector<ElementTy> Resources;
- std::vector<ElementTy> Pool;
- /// A reference to the corresponding allocator.
- AllocTy Allocator;
-
- /// If `Resources` is used up, we will fill in more resources. It assumes that
- /// the new size `Size` is always larger than the current size.
- bool resize(size_t Size) {
- assert(Resources.size() == Pool.size() && "size mismatch");
- auto CurSize = Resources.size();
- assert(Size > CurSize && "Unexpected smaller size");
- Pool.reserve(Size);
- Resources.reserve(Size);
- for (auto I = CurSize; I < Size; ++I) {
- ElementTy NewItem;
- int Ret = Allocator.create(NewItem);
- if (Ret != OFFLOAD_SUCCESS)
- return false;
- Pool.push_back(NewItem);
- Resources.push_back(NewItem);
- }
- return true;
- }
-
-public:
- ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
- : Allocator(std::move(A)) {
- if (Size)
- (void)resize(Size);
- }
-
- ~ResourcePoolTy() noexcept { clear(); }
-
- /// Get a resource from the pool. `Next` always points to the next available
- /// resource. That means `[0, Next-1]` have been assigned and `[Next, ...]` are
- /// still available. If there is no resource left, we will ask for more. Each
- /// time a resource is assigned, `Next` increases by one.
- /// xxxxxs+++++++++
- ///      ^
- ///      Next
- /// After assignment, the pool becomes the following and s is assigned.
- /// xxxxxs+++++++++
- ///       ^
- ///       Next
- int acquire(ElementTy &R) noexcept {
- std::lock_guard<std::mutex> LG(Mutex);
- if (Next == Resources.size()) {
- auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
- if (!resize(NewSize))
- return OFFLOAD_FAIL;
- }
-
- assert(Next < Resources.size());
-
- R = Resources[Next++];
-
- return OFFLOAD_SUCCESS;
- }
-
- /// Return the resource back to the pool. When we return a resource, we need
- /// to first decrease `Next`, and then copy the resource back. It is worth
- /// noting that the order in which resources are returned might differ from
- /// the order in which they were assigned; that is, at some point, there might
- /// be two identical resources.
- /// xxax+a+++++
- ///     ^
- ///     Next
- /// However, it doesn't matter, because they're always on the two sides of
- /// `Next`. The left one will in the end be overwritten by another resource.
- /// Therefore, after several executions, the order of the pool might be
- /// different from its initial state.
- void release(ElementTy R) noexcept {
- std::lock_guard<std::mutex> LG(Mutex);
- Resources[--Next] = R;
- }
-
- /// Release all stored resources and clear the pool.
- /// Note: This function is not thread safe. Be sure to guard it if necessary.
- void clear() noexcept {
- for (auto &R : Pool)
- (void)Allocator.destroy(R);
- Pool.clear();
- Resources.clear();
- }
-};
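A short usage sketch for the pool above, using the stream allocator defined
earlier in this file:

    ResourcePoolTy<StreamAllocatorTy> Streams(StreamAllocatorTy(), /*Size=*/32);

    CUstream S;
    if (Streams.acquire(S) == OFFLOAD_SUCCESS) {
      // ... enqueue work on S ...
      Streams.release(S); // hand it back; Next moves back by one
    }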
-
-} // namespace
-
-class DeviceRTLTy {
- int NumberOfDevices;
- // OpenMP environment properties
- int EnvNumTeams;
- unsigned int EnvTeamLimit;
- unsigned int EnvTeamThreadLimit;
- // OpenMP requires flags
- int64_t RequiresFlags;
- // Amount of dynamic shared memory to use at launch.
- uint64_t DynamicMemorySize;
-
- /// Number of initial streams for each device.
- int NumInitialStreams = 32;
-
- /// Number of initial events for each device.
- int NumInitialEvents = 8;
-
- static constexpr const int32_t HardThreadLimit = 1024;
- static constexpr const int32_t DefaultNumTeams = 128;
- static constexpr const int32_t DefaultNumThreads = 128;
-
- using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
- std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
-
- using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
- std::vector<std::unique_ptr<EventPoolTy>> EventPool;
-
- std::vector<DeviceDataTy> DeviceData;
- std::vector<std::vector<CUmodule>> Modules;
-
- /// Vector of flags indicating the initialization status of all associated
- /// devices.
- std::vector<bool> InitializedFlags;
-
- enum class PeerAccessState : uint8_t { Unknown, Yes, No };
- std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
- std::mutex PeerAccessMatrixLock;
-
- /// A class responsible for interacting with device native runtime library to
- /// allocate and free memory.
- class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
- public:
- void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
- if (Size == 0)
- return nullptr;
-
- void *MemAlloc = nullptr;
- CUresult Err;
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- CUdeviceptr DevicePtr;
- Err = cuMemAlloc(&DevicePtr, Size);
- MemAlloc = (void *)DevicePtr;
- if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
- return nullptr;
- break;
- case TARGET_ALLOC_HOST:
- void *HostPtr;
- Err = cuMemAllocHost(&HostPtr, Size);
- MemAlloc = HostPtr;
- if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
- return nullptr;
- break;
- case TARGET_ALLOC_SHARED:
- CUdeviceptr SharedPtr;
- Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
- MemAlloc = (void *)SharedPtr;
- if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
- return nullptr;
- break;
- }
-
- return MemAlloc;
- }
-
- int free(void *TgtPtr, TargetAllocTy Kind) override {
- CUresult Err;
- // Host pinned memory must be freed differently.
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_SHARED:
- Err = cuMemFree((CUdeviceptr)TgtPtr);
- if (!checkResult(Err, "Error returned from cuMemFree\n"))
- return OFFLOAD_FAIL;
- break;
- case TARGET_ALLOC_HOST:
- Err = cuMemFreeHost(TgtPtr);
- if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
- return OFFLOAD_FAIL;
- break;
- }
-
- return OFFLOAD_SUCCESS;
- }
- };
-
- /// A vector of device allocators
- std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
-
- /// A vector of memory managers. Since the memory manager is non-copyable and
- /// non-movable, we wrap each one in a std::unique_ptr.
- std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
-
- /// Whether to use the memory manager.
- bool UseMemoryManager = true;
-
- // Record entry point associated with device
- void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
- FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
- E.Entries.push_back(Entry);
- }
-
- // Return a pointer to the entry associated with the pointer
- const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
- const void *Addr) const {
- for (const __tgt_offload_entry &Itr :
- DeviceData[DeviceId].FuncGblEntries.back().Entries)
- if (Itr.addr == Addr)
- return &Itr;
-
- return nullptr;
- }
-
- // Return the pointer to the target entries table
- __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
- FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-
- if (E.Entries.empty())
- return nullptr;
-
- // Update table info according to the entries and return the pointer
- E.Table.EntriesBegin = E.Entries.data();
- E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
-
- return &E.Table;
- }
-
- // Clear entries table for a device
- void clearOffloadEntriesTable(const int DeviceId) {
- DeviceData[DeviceId].FuncGblEntries.emplace_back();
- FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
- E.Entries.clear();
- E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
- }
-
-public:
- CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- if (!AsyncInfo->Queue) {
- CUstream S;
- if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
- return nullptr;
-
- AsyncInfo->Queue = S;
- }
-
- return reinterpret_cast<CUstream>(AsyncInfo->Queue);
- }
-
- // This class should not be copied
- DeviceRTLTy(const DeviceRTLTy &) = delete;
- DeviceRTLTy(DeviceRTLTy &&) = delete;
-
- DeviceRTLTy()
- : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
- EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
- DynamicMemorySize(0) {
-
- DP("Start initializing CUDA\n");
-
- CUresult Err = cuInit(0);
- if (Err == CUDA_ERROR_INVALID_HANDLE) {
- // Can't call cuGetErrorString if dlsym failed
- DP("Failed to load CUDA shared library\n");
- return;
- }
- if (Err == CUDA_ERROR_NO_DEVICE) {
- DP("There are no devices supporting CUDA.\n");
- return;
- }
- if (!checkResult(Err, "Error returned from cuInit\n")) {
- return;
- }
-
- Err = cuDeviceGetCount(&NumberOfDevices);
- if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
- return;
-
- if (NumberOfDevices == 0) {
- DP("There are no devices supporting CUDA.\n");
- return;
- }
-
- DeviceData.resize(NumberOfDevices);
- Modules.resize(NumberOfDevices);
- StreamPool.resize(NumberOfDevices);
- EventPool.resize(NumberOfDevices);
- PeerAccessMatrix.resize(NumberOfDevices);
- for (auto &V : PeerAccessMatrix)
- V.resize(NumberOfDevices, PeerAccessState::Unknown);
-
- // Get environment variables regarding teams
- if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
- // OMP_TEAM_LIMIT has been set
- EnvTeamLimit = std::stoi(EnvStr);
- DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
- }
- if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
- // OMP_TEAMS_THREAD_LIMIT has been set
- EnvTeamThreadLimit = std::stoi(EnvStr);
- DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
- }
- if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
- // OMP_NUM_TEAMS has been set
- EnvNumTeams = std::stoi(EnvStr);
- DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
- }
- if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
- // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
- DynamicMemorySize = std::stoi(EnvStr);
- DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
- DynamicMemorySize);
- }
- if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
- // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
- NumInitialStreams = std::stoi(EnvStr);
- DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
- }
-
- for (int I = 0; I < NumberOfDevices; ++I)
- DeviceAllocators.emplace_back();
-
- // Get the size threshold from environment variable
- std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
- UseMemoryManager = Res.second;
- size_t MemoryManagerThreshold = Res.first;
-
- if (UseMemoryManager)
- for (int I = 0; I < NumberOfDevices; ++I)
- MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
- DeviceAllocators[I], MemoryManagerThreshold));
-
- // We lazily initialize all devices later.
- InitializedFlags.assign(NumberOfDevices, false);
- }
-
- ~DeviceRTLTy() {
- for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
- deinitDevice(DeviceId);
- }
-
- // Check whether a given DeviceId is valid
- bool isValidDeviceId(const int DeviceId) const {
- return DeviceId >= 0 && DeviceId < NumberOfDevices;
- }
-
- int getNumOfDevices() const { return NumberOfDevices; }
-
- void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
-
- int initDevice(const int DeviceId) {
- CUdevice Device;
-
- DP("Getting device %d\n", DeviceId);
- CUresult Err = cuDeviceGet(&Device, DeviceId);
- if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
- return OFFLOAD_FAIL;
-
- assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
- InitializedFlags[DeviceId] = true;
-
- // Query the current flags of the primary context and set its flags if
- // it is inactive
- unsigned int FormerPrimaryCtxFlags = 0;
- int FormerPrimaryCtxIsActive = 0;
- Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
- &FormerPrimaryCtxIsActive);
- if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
- return OFFLOAD_FAIL;
-
- if (FormerPrimaryCtxIsActive) {
- DP("The primary context is active, no change to its flags\n");
- if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
- CU_CTX_SCHED_BLOCKING_SYNC)
- DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
- } else {
- DP("The primary context is inactive, set its flags to "
- "CU_CTX_SCHED_BLOCKING_SYNC\n");
- Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
- if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
- return OFFLOAD_FAIL;
- }
-
- // Retain the per device primary context and save it to use whenever this
- // device is selected.
- Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
- if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
- return OFFLOAD_FAIL;
-
- Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
- if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
- return OFFLOAD_FAIL;
-
- // Initialize the stream pool.
- if (!StreamPool[DeviceId])
- StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
- NumInitialStreams);
-
- // Initialize the event pool.
- if (!EventPool[DeviceId])
- EventPool[DeviceId] =
- std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
-
- // Query attributes to determine number of threads/block and blocks/grid.
- int MaxGridDimX;
- Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
- Device);
- if (Err != CUDA_SUCCESS) {
- DP("Error getting max grid dimension, use default value %d\n",
- DeviceRTLTy::DefaultNumTeams);
- DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
- } else {
- DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
- DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
- }
-
- // We are only exploiting threads along the x axis.
- int MaxBlockDimX;
- Err = cuDeviceGetAttribute(&MaxBlockDimX,
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
- if (Err != CUDA_SUCCESS) {
- DP("Error getting max block dimension, use default value %d\n",
- DeviceRTLTy::DefaultNumThreads);
- DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
- } else {
- DP("Using %d CUDA threads per block\n", MaxBlockDimX);
- DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
-
- if (EnvTeamThreadLimit > 0 &&
- DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
- DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
- "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
- DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
- DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
- }
- if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
- DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
- "capping at the hard limit\n",
- DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
- DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
- }
- }
-
- // Get and set warp size
- int WarpSize;
- Err =
- cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
- if (Err != CUDA_SUCCESS) {
- DP("Error getting warp size, assume default value 32\n");
- DeviceData[DeviceId].WarpSize = 32;
- } else {
- DP("Using warp size %d\n", WarpSize);
- DeviceData[DeviceId].WarpSize = WarpSize;
- }
-
- // Adjust teams to the env variables
- if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
- DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
- EnvTeamLimit);
- DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
- }
-
- size_t StackLimit;
- size_t HeapLimit;
- if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
- StackLimit = std::stol(EnvStr);
- if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
- return OFFLOAD_FAIL;
- } else {
- if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
- return OFFLOAD_FAIL;
- }
- if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
- HeapLimit = std::stol(EnvStr);
- if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
- return OFFLOAD_FAIL;
- } else {
- if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
- return OFFLOAD_FAIL;
- }
-
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Device supports up to %d CUDA blocks and %d threads with a "
- "warp size of %d\n",
- DeviceData[DeviceId].BlocksPerGrid,
- DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Device heap size is %d Bytes, device stack size is %d Bytes per "
- "thread\n",
- (int)HeapLimit, (int)StackLimit);
-
- // Set default number of teams
- if (EnvNumTeams > 0) {
- DP("Default number of teams set according to environment %d\n",
- EnvNumTeams);
- DeviceData[DeviceId].NumTeams = EnvNumTeams;
- } else {
- DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
- DP("Default number of teams set according to library's default %d\n",
- DeviceRTLTy::DefaultNumTeams);
- }
-
- if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
- DP("Default number of teams exceeds device limit, capping at %d\n",
- DeviceData[DeviceId].BlocksPerGrid);
- DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
- }
-
- // Set default number of threads
- DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
- DP("Default number of threads set according to library's default %d\n",
- DeviceRTLTy::DefaultNumThreads);
- if (DeviceData[DeviceId].NumThreads >
- DeviceData[DeviceId].ThreadsPerBlock) {
- DP("Default number of threads exceeds device limit, capping at %d\n",
- DeviceData[DeviceId].ThreadsPerBlock);
- DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
- }
-
- return OFFLOAD_SUCCESS;
- }
-
- int deinitDevice(const int DeviceId) {
- auto IsInitialized = InitializedFlags[DeviceId];
- if (!IsInitialized)
- return OFFLOAD_SUCCESS;
- InitializedFlags[DeviceId] = false;
-
- if (UseMemoryManager)
- MemoryManagers[DeviceId].release();
-
- StreamPool[DeviceId].reset();
- EventPool[DeviceId].reset();
-
- DeviceDataTy &D = DeviceData[DeviceId];
- if (!checkResult(cuCtxSetCurrent(D.Context),
- "Error returned from cuCtxSetCurrent\n"))
- return OFFLOAD_FAIL;
-
- // Unload all modules.
- for (auto &M : Modules[DeviceId])
- if (!checkResult(cuModuleUnload(M),
- "Error returned from cuModuleUnload\n"))
- return OFFLOAD_FAIL;
-
- // Destroy context.
- CUdevice Device;
- if (!checkResult(cuCtxGetDevice(&Device),
- "Error returned from cuCtxGetDevice\n"))
- return OFFLOAD_FAIL;
-
- if (!checkResult(cuDevicePrimaryCtxRelease(Device),
- "Error returned from cuDevicePrimaryCtxRelease\n"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-
- __tgt_target_table *loadBinary(const int DeviceId,
- const __tgt_device_image *Image) {
- // Clear the offload table as we are going to create a new one.
- clearOffloadEntriesTable(DeviceId);
-
- // Create the module and extract the function pointers.
- CUmodule Module;
- DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
- CUresult Err =
- cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
- if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
- return nullptr;
-
- DP("CUDA module successfully loaded!\n");
-
- Modules[DeviceId].push_back(Module);
-
- // Find the symbols in the module by name.
- const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
- const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
- std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
- for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
- if (!E->addr) {
- // We return nullptr when something like this happens; the host should
- // always have something in the address to uniquely identify the target
- // region.
- DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
- return nullptr;
- }
-
- if (E->size) {
- __tgt_offload_entry Entry = *E;
- CUdeviceptr CUPtr;
- size_t CUSize;
- Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
- // We keep this style here because we need the name
- if (Err != CUDA_SUCCESS) {
- REPORT("Loading global '%s' Failed\n", E->name);
- CUDA_ERR_STRING(Err);
- return nullptr;
- }
-
- if (CUSize != E->size) {
- DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
- CUSize, E->size);
- return nullptr;
- }
-
- DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
- DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
-
- Entry.addr = (void *)(CUPtr);
-
- // Note: In the current implementation declare target variables
- // can either be link or to. This means that once unified
- // memory is activated via the requires directive, the variable
- // can be used directly from the host in both cases.
- // TODO: when variable types other than to or link are added,
- // the below condition should be changed to explicitly
- // check for to and link variable types:
- // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
- // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
- if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
- // If unified memory is present any target link or to variables
- // can access host addresses directly. There is no longer a
- // need for device copies.
- cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
- DP("Copy linked variable host address (" DPxMOD
- ") to device address (" DPxMOD ")\n",
- DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
- }
-
- addOffloadEntry(DeviceId, Entry);
-
- continue;
- }
-
- CUfunction Func;
- Err = cuModuleGetFunction(&Func, Module, E->name);
- // We keep this style here because we need the name
- if (Err != CUDA_SUCCESS) {
- REPORT("Loading '%s' Failed\n", E->name);
- CUDA_ERR_STRING(Err);
- return nullptr;
- }
-
- DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
- DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
-
- // Default value GENERIC (in case the symbol is missing from the cubin file).
- llvm::omp::OMPTgtExecModeFlags ExecModeVal =
- llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
- std::string ExecModeNameStr(E->name);
- ExecModeNameStr += "_exec_mode";
- const char *ExecModeName = ExecModeNameStr.c_str();
-
- CUdeviceptr ExecModePtr;
- size_t CUSize;
- Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
- if (Err == CUDA_SUCCESS) {
- if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
- DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
- ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
- return nullptr;
- }
-
- Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
- if (Err != CUDA_SUCCESS) {
- REPORT("Error when copying data from device to host. Pointers: "
- "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
- DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
- CUDA_ERR_STRING(Err);
- return nullptr;
- }
- } else {
- DP("Loading global exec_mode '%s' - symbol missing, using default "
- "value GENERIC (1)\n",
- ExecModeName);
- }
-
- KernelsList.emplace_back(Func, ExecModeVal);
-
- __tgt_offload_entry Entry = *E;
- Entry.addr = &KernelsList.back();
- addOffloadEntry(DeviceId, Entry);
- }
-
- // send device environment data to the device
- {
- // TODO: The device ID used here is not the real device ID used by OpenMP.
- DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
- static_cast<uint32_t>(DeviceId),
- static_cast<uint32_t>(DynamicMemorySize)};
-
- if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
- DeviceEnv.DebugKind = std::stoi(EnvStr);
-
- const char *DeviceEnvName = "__omp_rtl_device_environment";
- CUdeviceptr DeviceEnvPtr;
- size_t CUSize;
-
- Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
- if (Err == CUDA_SUCCESS) {
- if (CUSize != sizeof(DeviceEnv)) {
- REPORT(
- "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
- DeviceEnvName, CUSize, sizeof(DeviceEnv));
- CUDA_ERR_STRING(Err);
- return nullptr;
- }
-
- Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
- if (Err != CUDA_SUCCESS) {
- REPORT("Error when copying data from host to device. Pointers: "
- "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
- DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
- CUDA_ERR_STRING(Err);
- return nullptr;
- }
-
- DP("Sending global device environment data %zu bytes\n", CUSize);
- } else {
- DP("Finding global device environment '%s' - symbol missing.\n",
- DeviceEnvName);
- DP("Continue, considering this is a device RTL which does not accept "
- "environment setting.\n");
- }
- }
-
- return getOffloadEntriesTable(DeviceId);
- }
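The device-environment handshake at the end of loadBinary assumes the image
defines a matching global. A hedged sketch of the device-side counterpart,
with the field order inferred from the constructor call above (the real layout
lives in DeviceEnvironment.h and may differ):

    struct DeviceEnvironmentTy {
      uint32_t DebugKind;      // from LIBOMPTARGET_DEVICE_RTL_DEBUG
      uint32_t NumDevices;
      uint32_t DeviceNum;
      uint32_t DynamicMemSize; // from LIBOMPTARGET_SHARED_MEMORY_SIZE
    };

    // Defined by the device RTL and patched by the host at load time:
    //   DeviceEnvironmentTy __omp_rtl_device_environment;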
-
- void *dataAlloc(const int DeviceId, const int64_t Size,
- const TargetAllocTy Kind) {
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- if (UseMemoryManager)
- return MemoryManagers[DeviceId]->allocate(Size, nullptr);
- else
- return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
- case TARGET_ALLOC_HOST:
- case TARGET_ALLOC_SHARED:
- return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
- }
-
- REPORT("Invalid target data allocation kind or requested allocator not "
- "implemented yet\n");
-
- return nullptr;
- }
-
- int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
- const int64_t Size, __tgt_async_info *AsyncInfo) const {
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- CUstream Stream = getStream(DeviceId, AsyncInfo);
- CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
- if (Err != CUDA_SUCCESS) {
- DP("Error when copying data from host to device. Pointers: host "
- "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
- }
-
- int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
- const int64_t Size, __tgt_async_info *AsyncInfo) const {
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- CUstream Stream = getStream(DeviceId, AsyncInfo);
- CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
- if (Err != CUDA_SUCCESS) {
- DP("Error when copying data from device to host. Pointers: host "
- "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
- DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
- }
-
- int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
- int64_t Size, __tgt_async_info *AsyncInfo) {
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- CUresult Err;
- CUstream Stream = getStream(SrcDevId, AsyncInfo);
-
- // If they are two devices, we try peer to peer copy first
- if (SrcDevId != DstDevId) {
- std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
-
- switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
- case PeerAccessState::No: {
- REPORT("Peer access from %" PRId32 " to %" PRId32
- " is not supported. Fall back to D2D memcpy.\n",
- SrcDevId, DstDevId);
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
- case PeerAccessState::Unknown: {
- int CanAccessPeer = 0;
- Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
- if (Err != CUDA_SUCCESS) {
- REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
- ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
- SrcDevId, DstDevId);
- CUDA_ERR_STRING(Err);
- PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
-
- if (!CanAccessPeer) {
- REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
- "memcpy.\n",
- SrcDevId, DstDevId);
- PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
-
- Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
- if (Err != CUDA_SUCCESS) {
- REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
- ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
- SrcDevId, DstDevId);
- CUDA_ERR_STRING(Err);
- PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
-
- PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
-
- [[fallthrough]];
- }
- case PeerAccessState::Yes: {
- Err = cuMemcpyPeerAsync(
- (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
- (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
- if (Err == CUDA_SUCCESS)
- return OFFLOAD_SUCCESS;
-
- DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
- ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
- ". Fall back to D2D memcpy.\n",
- DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
- CUDA_ERR_STRING(Err);
-
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
- }
- }
-
- return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
- }
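For reference, a cross-device omp_target_memcpy is what ultimately funnels
into dataExchange above. A hedged sketch; buffers and device numbers are
hypothetical:

    #include <cstddef>
    #include <omp.h>

    void copyAcrossDevices(void *Dst, void *Src, size_t N) {
      omp_target_memcpy(Dst, Src, N, /*dst_offset=*/0, /*src_offset=*/0,
                        /*dst_device=*/1, /*src_device=*/0);
    }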
-
- int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) {
- switch (Kind) {
- case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE:
- if (UseMemoryManager)
- return MemoryManagers[DeviceId]->free(TgtPtr);
- else
- return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
- case TARGET_ALLOC_HOST:
- case TARGET_ALLOC_SHARED:
- return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
- }
-
- REPORT("Invalid target data allocation kind or requested allocator not "
- "implemented yet\n");
-
- return OFFLOAD_FAIL;
- }
-
- int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
- ptrdiff_t *TgtOffsets, const int ArgNum,
- const int TeamNum, const int ThreadLimit,
- const unsigned int LoopTripCount,
- __tgt_async_info *AsyncInfo) const {
- // All args are references.
- std::vector<void *> Args(ArgNum);
- std::vector<void *> Ptrs(ArgNum);
-
- for (int I = 0; I < ArgNum; ++I) {
- Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
- Args[I] = &Ptrs[I];
- }
-
- KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
-
- const bool IsSPMDGenericMode =
- KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
- const bool IsSPMDMode =
- KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
- const bool IsGenericMode =
- KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
- int CudaThreadsPerBlock;
- if (ThreadLimit > 0) {
- DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
- CudaThreadsPerBlock = ThreadLimit;
- // Add master warp if necessary
- if (IsGenericMode) {
- DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
- CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
- }
- } else {
- DP("Setting CUDA threads per block to default %d\n",
- DeviceData[DeviceId].NumThreads);
- CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
- }
-
- if ((unsigned)CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
- DP("Threads per block capped at device limit %d\n",
- DeviceData[DeviceId].ThreadsPerBlock);
- CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
- }
-
- CUresult Err;
- if (!KernelInfo->MaxThreadsPerBlock) {
- Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
- CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
- KernelInfo->Func);
- if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
- return OFFLOAD_FAIL;
- }
-
- if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
- DP("Threads per block capped at kernel limit %d\n",
- KernelInfo->MaxThreadsPerBlock);
- CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
- }
-
- unsigned int CudaBlocksPerGrid;
- if (TeamNum <= 0) {
- if (LoopTripCount > 0 && EnvNumTeams < 0) {
- if (IsSPMDGenericMode) {
- // If we reach this point, then we are executing a kernel that was
- // transformed from Generic-mode to SPMD-mode. This kernel has
- // SPMD-mode execution, but needs its blocks to be scheduled
- // differently because the current loop trip count only applies to the
- // `teams distribute` region and will create far too few blocks using
- // the regular SPMD-mode method.
- CudaBlocksPerGrid = LoopTripCount;
- } else if (IsSPMDMode) {
- // We have a combined construct, i.e. `target teams distribute
- // parallel for [simd]`. We launch enough teams so that each thread
- // will execute one iteration of the loop, rounding up to the nearest
- // integer.
- CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
- } else if (IsGenericMode) {
- // If we reach this point, then we have a non-combined construct, i.e.
- // `teams distribute` with a nested `parallel for` and each team is
- // assigned one iteration of the `distribute` loop. E.g.:
- //
- // #pragma omp target teams distribute
- // for(...loop_tripcount...) {
- // #pragma omp parallel for
- // for(...) {}
- // }
- //
- // Threads within a team will execute the iterations of the `parallel`
- // loop.
- CudaBlocksPerGrid = LoopTripCount;
- } else {
- REPORT("Unknown execution mode: %d\n",
- static_cast<int8_t>(KernelInfo->ExecutionMode));
- return OFFLOAD_FAIL;
- }
- DP("Using %d teams due to loop trip count %" PRIu32
- " and number of threads per block %d\n",
- CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
- } else {
- DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
- CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
- }
- } else {
- DP("Using requested number of teams %d\n", TeamNum);
- CudaBlocksPerGrid = TeamNum;
- }
-
- if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
- DP("Capping number of teams to team limit %d\n",
- DeviceData[DeviceId].BlocksPerGrid);
- CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
- }
-
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Launching kernel %s with %d blocks and %d threads in %s mode\n",
- (getOffloadEntry(DeviceId, TgtEntryPtr))
- ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
- : "(null)",
- CudaBlocksPerGrid, CudaThreadsPerBlock,
- (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
-
- CUstream Stream = getStream(DeviceId, AsyncInfo);
- Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
- /* gridDimZ */ 1, CudaThreadsPerBlock,
- /* blockDimY */ 1, /* blockDimZ */ 1,
- DynamicMemorySize, Stream, &Args[0], nullptr);
- if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
- return OFFLOAD_FAIL;
-
- DP("Launch of entry point at " DPxMOD " successful!\n",
- DPxPTR(TgtEntryPtr));
-
- return OFFLOAD_SUCCESS;
- }
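
The team-count selection above reduces to a little arithmetic once the
execution mode is known. A minimal standalone sketch of the three cases,
with hypothetical values standing in for the DeviceData fields and kernel
limits (illustration only, not part of the plugin):

  #include <cstdio>

  int main() {
    const int WarpSize = 32;      // stands in for DeviceData[Dev].WarpSize
    const int ThreadLimit = 128;  // requested thread_limit
    const unsigned LoopTripCount = 10000;

    // Generic mode: the requested threads plus one master warp per block.
    int GenericThreads = ThreadLimit + WarpSize;

    // SPMD mode: one loop iteration per thread, rounded up to whole blocks.
    unsigned SPMDBlocks = ((LoopTripCount - 1) / ThreadLimit) + 1;

    // Generic-SPMD mode: one block per `teams distribute` iteration.
    unsigned GenericSPMDBlocks = LoopTripCount;

    std::printf("generic threads=%d, spmd blocks=%u, generic-spmd blocks=%u\n",
                GenericThreads, SPMDBlocks, GenericSPMDBlocks);
    return 0;
  }

Either result would still be capped by the device and kernel limits queried
above before the launch.
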
-
- int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
- CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
- CUresult Err = cuStreamSynchronize(Stream);
-
- // Once the stream is synchronized, return it to stream pool and reset
- // AsyncInfo. This is to make sure the synchronization only works for its
- // own tasks.
- StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
- AsyncInfo->Queue = nullptr;
-
- if (Err != CUDA_SUCCESS) {
- DP("Error when synchronizing stream. stream = " DPxMOD
- ", async info ptr = " DPxMOD "\n",
- DPxPTR(Stream), DPxPTR(AsyncInfo));
- CUDA_ERR_STRING(Err);
- }
- return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
- }
-
- int queryAsync(const int DeviceId, __tgt_async_info *AsyncInfo) const {
- CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
- CUresult Err = cuStreamQuery(Stream);
-
- // Streams that are not ready must be treated as successful operations.
- if (Err == CUDA_ERROR_NOT_READY)
- return OFFLOAD_SUCCESS;
-
- // Once the stream is synchronized or an error occurs, return it to the
- // stream pool and reset AsyncInfo. This is to make sure the
- // synchronization only works for its own tasks.
- StreamPool[DeviceId]->release(Stream);
- AsyncInfo->Queue = nullptr;
-
- if (Err != CUDA_SUCCESS) {
- DP("Error when querying for stream progress. stream = " DPxMOD
- ", async info ptr = " DPxMOD "\n",
- DPxPTR(Stream), DPxPTR(AsyncInfo));
- CUDA_ERR_STRING(Err);
- }
- return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
- }
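
The return-code mapping in queryAsync is the subtle part: cuStreamQuery has
three interesting outcomes and only a genuine error is a failure. A tiny
sketch of the same decision table with hypothetical stand-in values
(illustration only):

  #include <cstdio>

  enum QueryResult { Done, NotReady, Error }; // hypothetical stand-ins

  // A not-ready stream is still a successful query, not a failure.
  static int toOffloadResult(QueryResult R) {
    if (R == NotReady)
      return 0;                 // OFFLOAD_SUCCESS: work is still in flight
    return (R == Done) ? 0 : 1; // only a real error maps to OFFLOAD_FAIL
  }

  int main() {
    std::printf("%d %d %d\n", toOffloadResult(Done),
                toOffloadResult(NotReady), toOffloadResult(Error));
    return 0;
  }
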
-
- void printDeviceInfo(int32_t DeviceId) {
- char TmpChar[1000];
- std::string TmpStr;
- size_t TmpSt;
- int TmpInt, TmpInt2, TmpInt3;
-
- CUdevice Device;
- checkResult(cuDeviceGet(&Device, DeviceId),
- "Error returned from cuCtxGetDevice\n");
-
- cuDriverGetVersion(&TmpInt);
- printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
- printf(" CUDA Device Number: \t\t%d \n", DeviceId);
- checkResult(cuDeviceGetName(TmpChar, 1000, Device),
- "Error returned from cuDeviceGetName\n");
- printf(" Device Name: \t\t\t%s \n", TmpChar);
- checkResult(cuDeviceTotalMem(&TmpSt, Device),
- "Error returned from cuDeviceTotalMem\n");
- printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Registers per Block: \t\t%d \n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
- "Error returned from cuDeviceGetAttribute\n");
- checkResult(cuDeviceGetAttribute(
- &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
- "Error returned from cuDeviceGetAttribute\n");
- checkResult(cuDeviceGetAttribute(
- &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
- TmpInt3);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
- "Error returned from cuDeviceGetAttribute\n");
- checkResult(cuDeviceGetAttribute(
- &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
- "Error returned from cuDeviceGetAttribute\n");
- checkResult(cuDeviceGetAttribute(
- &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
- TmpInt3);
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
- "Error returned from cuDeviceGetAttribute\n");
- if (TmpInt == CU_COMPUTEMODE_DEFAULT)
- TmpStr = "DEFAULT";
- else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
- TmpStr = "PROHIBITED";
- else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
- TmpStr = "EXCLUSIVE PROCESS";
- else
- TmpStr = "unknown";
- printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
- checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
- Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
- Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
- "Error returned from cuDeviceGetAttribute\n");
- checkResult(
- cuDeviceGetAttribute(
- &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
- "Error returned from cuDeviceGetAttribute\n");
- printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
- }
-
- int createEvent(int DeviceId, void **P) {
- CUevent Event = nullptr;
- if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
- *P = Event;
- return OFFLOAD_SUCCESS;
- }
-
- int destroyEvent(int DeviceId, void *EventPtr) {
- EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
- return OFFLOAD_SUCCESS;
- }
-
- int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
- void *EventPtr) const {
- CUstream Stream = getStream(DeviceId, AsyncInfo);
- CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
- // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available in
- // recent CUDA versions, where it is defined as 0x0. In earlier versions,
- // per the CUDA API documentation, the argument has to be 0x0.
- CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
- if (Err != CUDA_SUCCESS) {
- DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
- DPxPTR(Stream), DPxPTR(Event));
- CUDA_ERR_STRING(Err);
- return OFFLOAD_FAIL;
- }
-
- return OFFLOAD_SUCCESS;
- }
-
- int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
- if (AsyncInfo->Queue) {
- StreamPool[DeviceId]->release(
- reinterpret_cast<CUstream>(AsyncInfo->Queue));
- AsyncInfo->Queue = nullptr;
- }
-
- return OFFLOAD_SUCCESS;
- }
-
- int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
- *AsyncInfo = new __tgt_async_info;
- getStream(DeviceId, *AsyncInfo);
- return OFFLOAD_SUCCESS;
- }
-
- int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
- const char **ErrStr) const {
- assert(DeviceInfo && "DeviceInfo is nullptr");
-
- if (!DeviceInfo->Context)
- DeviceInfo->Context = DeviceData[DeviceId].Context;
- if (!DeviceInfo->Device) {
- CUdevice Dev;
- CUresult Err = cuDeviceGet(&Dev, DeviceId);
- if (Err == CUDA_SUCCESS) {
- DeviceInfo->Device = reinterpret_cast<void *>(Dev);
- } else {
- cuGetErrorString(Err, ErrStr);
- return OFFLOAD_FAIL;
- }
- }
- return OFFLOAD_SUCCESS;
- }
-
- int setContext(int DeviceId) {
- assert(InitializedFlags[DeviceId] && "Device is not initialized");
-
- CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
- if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
- return OFFLOAD_FAIL;
-
- return OFFLOAD_SUCCESS;
- }
-};
-
-DeviceRTLTy DeviceRTL;
-} // namespace
-
-// Exposed library API function
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
- return elf_check_machine(Image, /* EM_CUDA */ 190);
-}
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
- __tgt_image_info *Info) {
- if (!__tgt_rtl_is_valid_binary(Image))
- return false;
-
- // A subarchitecture was not specified. Assume it is compatible.
- if (!Info || !Info->Arch)
- return true;
-
- int32_t NumberOfDevices = 0;
- if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
- return false;
-
- StringRef ArchStr = StringRef(Info->Arch).drop_front(sizeof("sm_") - 1);
- for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
- CUdevice Device;
- if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
- return false;
-
- int32_t Major, Minor;
- if (cuDeviceGetAttribute(&Major,
- CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- Device) != CUDA_SUCCESS)
- return false;
- if (cuDeviceGetAttribute(&Minor,
- CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
- Device) != CUDA_SUCCESS)
- return false;
-
- // A cubin generated for a certain compute capability can run on any GPU
- // with the same major revision and the same or a higher minor revision.
- int32_t ImageMajor = ArchStr[0] - '0';
- int32_t ImageMinor = ArchStr[1] - '0';
- if (Major != ImageMajor || Minor < ImageMinor)
- return false;
- }
-
- DP("Image has compatible compute capability: %s\n", Info->Arch);
- return true;
-}
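
The compatibility rule implemented above (same major revision, same or a
higher minor revision) is easy to isolate. A sketch assuming a two-digit
arch string such as "sm_75"; the helper below is hypothetical and only
mirrors the check, it is not part of the plugin:

  #include <cassert>
  #include <string>

  // True if a cubin built for `Arch` can run on a device with the given
  // compute capability.
  static bool isImageCompatible(const std::string &Arch, int DevMajor,
                                int DevMinor) {
    std::string Str = Arch.substr(3); // drop the "sm_" prefix
    int ImageMajor = Str[0] - '0';
    int ImageMinor = Str[1] - '0';
    return DevMajor == ImageMajor && DevMinor >= ImageMinor;
  }

  int main() {
    assert(isImageCompatible("sm_70", 7, 5));  // sm_70 cubin on an sm_75 GPU
    assert(!isImageCompatible("sm_75", 8, 0)); // major revisions must match
    return 0;
  }
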
-
-int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
- DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
- DeviceRTL.setRequiresFlag(RequiresFlags);
- return RequiresFlags;
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
- if (DeviceRTL.isValidDeviceId(SrcDevId) &&
- DeviceRTL.isValidDeviceId(DstDevId))
- return 1;
-
- return 0;
-}
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // The context is set when the device is initialized.
-
- return DeviceRTL.initDevice(DeviceId);
-}
-
-int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // The context is set when the device is deinitialized.
-
- return DeviceRTL.deinitDevice(DeviceId);
-}
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
- __tgt_device_image *Image) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return nullptr;
-
- return DeviceRTL.loadBinary(DeviceId, Image);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
- int32_t Kind) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return nullptr;
-
- return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // Context is set in __tgt_rtl_data_submit_async.
-
- __tgt_async_info AsyncInfo;
- const int32_t Rc =
- __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
- void *HstPtr, int64_t Size,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // Context is set in __tgt_rtl_data_retrieve_async.
-
- __tgt_async_info AsyncInfo;
- const int32_t Rc =
- __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
- void *TgtPtr, int64_t Size,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
- int DstDevId, void *DstPtr, int64_t Size,
- __tgt_async_info *AsyncInfo) {
- assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
- assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
- assert(AsyncInfo && "AsyncInfo is nullptr");
-
- if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
- AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
- int32_t DstDevId, void *DstPtr, int64_t Size) {
- assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
- assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
- // Context is set in __tgt_rtl_data_exchange_async.
-
- __tgt_async_info AsyncInfo;
- const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
- DstPtr, Size, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- int32_t ArgNum, int32_t TeamNum,
- int32_t ThreadLimit,
- uint64_t LoopTripcount) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // Context is set in __tgt_rtl_run_target_team_region_async.
-
- __tgt_async_info AsyncInfo;
- const int32_t Rc = __tgt_rtl_run_target_team_region_async(
- DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
- LoopTripcount, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_team_region_async(
- int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
- int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
- uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
- TgtOffsets, ArgNum, TeamNum, ThreadLimit,
- LoopTripcount, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- int32_t ArgNum) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // Context is set in __tgt_rtl_run_target_region_async.
-
- __tgt_async_info AsyncInfo;
- const int32_t Rc = __tgt_rtl_run_target_region_async(
- DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
- if (Rc != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- int32_t ArgNum,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // Context is set in __tgt_rtl_run_target_team_region_async.
- return __tgt_rtl_run_target_team_region_async(
- DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
- /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
- AsyncInfoPtr);
-}
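
All of the synchronous entry points above share one shape: stack-allocate a
__tgt_async_info, call the *_async variant, and synchronize before
returning. A minimal sketch of that pattern with hypothetical stand-ins for
the plugin calls (illustration only):

  struct AsyncInfo { void *Queue = nullptr; };

  // Hypothetical stand-ins for an async plugin operation and synchronize.
  static int opAsync(int /*Dev*/, AsyncInfo *AI) {
    static char Stream;  // pretend a stream was acquired from the pool
    AI->Queue = &Stream;
    return 0;            // OFFLOAD_SUCCESS
  }
  static int synchronize(int /*Dev*/, AsyncInfo *AI) {
    AI->Queue = nullptr; // wait for the work, then release the stream
    return 0;
  }

  static int opSync(int Dev) {
    AsyncInfo AI;
    if (opAsync(Dev, &AI) != 0)
      return 1;          // OFFLOAD_FAIL
    return synchronize(Dev, &AI);
  }

  int main() { return opSync(0); }
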
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
- assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
- // NOTE: We don't need to set context for stream sync.
- return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
- assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
- // NOTE: We don't need to set context for stream query.
- return DeviceRTL.queryAsync(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
- std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
- InfoLevel.store(NewInfoLevel);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- // NOTE: We don't need to set context for print device info.
- DeviceRTL.printDeviceInfo(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
- assert(Event && "event is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.createEvent(DeviceId, Event);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
- __tgt_async_info *AsyncInfoPtr) {
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
- assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
- assert(EventPtr && "event_ptr is nullptr");
- // NOTE: We might not need to set context for event record.
- return recordEvent(EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
- __tgt_async_info *AsyncInfoPtr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfoPtr && "async_info_ptr is nullptr");
- assert(EventPtr && "event is nullptr");
- // If we don't have a queue we need to set the context.
- if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
- return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
- assert(EventPtr && "event is nullptr");
- // NOTE: We might not need to set context for event sync.
- return syncEvent(EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
- assert(EventPtr && "event is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.destroyEvent(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
- __tgt_async_info *AsyncInfo) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfo && "async_info is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
- __tgt_async_info **AsyncInfo) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(AsyncInfo && "async_info is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
- __tgt_device_info *DeviceInfoPtr,
- const char **ErrStr) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
- assert(DeviceInfoPtr && "device_info_ptr is nullptr");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- KernelArgsTy *KernelArgs,
- __tgt_async_info *AsyncInfo) {
- assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
- if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
-
- return DeviceRTL.runTargetTeamRegion(
- DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs->NumArgs,
- KernelArgs->NumTeams[0], KernelArgs->ThreadLimit[0],
- KernelArgs->Tripcount, AsyncInfo);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
deleted file mode 100644
index cc7beda183afa8..00000000000000
--- a/openmp/libomptarget/plugins/exports
+++ /dev/null
@@ -1,6 +0,0 @@
-VERS1.0 {
- global:
- __tgt_rtl*;
- local:
- *;
-};
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
deleted file mode 100644
index 3fba82c3ae12ce..00000000000000
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for generic 64-bit machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ffi.h>
-#include <link.h>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-using namespace llvm;
-using namespace llvm::sys;
-
-#ifndef TARGET_NAME
-#define TARGET_NAME Generic ELF - 64bit
-#endif
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
-
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID 0
-#endif
-
-#include "elf_common.h"
-
-#define NUMBER_OF_DEVICES 4
-#define OFFLOAD_SECTION_NAME "omp_offloading_entries"
-
-/// A dynamic library loaded for this target.
-struct DynLibTy {
- std::string FileName;
- std::unique_ptr<DynamicLibrary> DynLib;
-};
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
- __tgt_target_table Table;
- SmallVector<__tgt_offload_entry> Entries;
-};
-
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
- std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
-public:
- std::list<DynLibTy> DynLibs;
-
- // Record entry point associated with device.
- void createOffloadTable(int32_t DeviceId,
- SmallVector<__tgt_offload_entry> &&Entries) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncGblEntries[DeviceId].emplace_back();
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- E.Entries = Entries;
- E.Table.EntriesBegin = E.Entries.begin();
- E.Table.EntriesEnd = E.Entries.end();
- }
-
- // Return true if the entry is associated with device.
- bool findOffloadEntry(int32_t DeviceId, void *Addr) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- for (__tgt_offload_entry *I = E.Table.EntriesBegin,
- *End = E.Table.EntriesEnd;
- I < End; ++I) {
- if (I->addr == Addr)
- return true;
- }
-
- return false;
- }
-
- // Return the pointer to the target entries table.
- __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
- assert(DeviceId < (int32_t)FuncGblEntries.size() &&
- "Unexpected device id!");
- FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
- return &E.Table;
- }
-
- RTLDeviceInfoTy(int32_t NumDevices) { FuncGblEntries.resize(NumDevices); }
-
- ~RTLDeviceInfoTy() {
- // Close dynamic libraries
- for (auto &Lib : DynLibs) {
- if (Lib.DynLib->isValid())
- remove(Lib.FileName.c_str());
- }
- }
-};
-
-static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-// If we don't have a valid ELF ID we can just fail.
-#if TARGET_ELF_ID < 1
- return 0;
-#else
- return elf_check_machine(Image, TARGET_ELF_ID);
-#endif
-}
-
-int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; }
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
- __tgt_device_image *Image) {
-
- DP("Dev %d: load binary from " DPxMOD " image\n", DeviceId,
- DPxPTR(Image->ImageStart));
-
- assert(DeviceId >= 0 && DeviceId < NUMBER_OF_DEVICES && "bad dev id");
-
- size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
-
- // Load the dynamic library and get the entry points. We use the dl
- // library to do the loading, but we could do it directly to avoid the
- // dump to a temporary file.
- //
- // 1) Create tmp file with the library contents.
- // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
- char TmpName[] = "/tmp/tmpfile_XXXXXX";
- int TmpFd = mkstemp(TmpName);
-
- if (TmpFd == -1)
- return nullptr;
-
- FILE *Ftmp = fdopen(TmpFd, "wb");
-
- if (!Ftmp)
- return nullptr;
-
- fwrite(Image->ImageStart, ImageSize, 1, Ftmp);
- fclose(Ftmp);
-
- std::string ErrMsg;
- auto DynLib = std::make_unique<sys::DynamicLibrary>(
- sys::DynamicLibrary::getPermanentLibrary(TmpName, &ErrMsg));
- DynLibTy Lib = {TmpName, std::move(DynLib)};
-
- if (!Lib.DynLib->isValid()) {
- DP("Target library loading error: %s\n", ErrMsg.c_str());
- return nullptr;
- }
-
- __tgt_offload_entry *HostBegin = Image->EntriesBegin;
- __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
- // Create a new offloading entry list using the device symbol address.
- SmallVector<__tgt_offload_entry> Entries;
- for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
- if (!E->addr)
- return nullptr;
-
- __tgt_offload_entry Entry = *E;
-
- void *DevAddr = Lib.DynLib->getAddressOfSymbol(E->name);
- Entry.addr = DevAddr;
-
- DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
- DPxPTR(E - HostBegin), E->name, DPxPTR(DevAddr));
-
- Entries.emplace_back(Entry);
- }
-
- DeviceInfo.createOffloadTable(DeviceId, std::move(Entries));
- DeviceInfo.DynLibs.emplace_back(std::move(Lib));
-
- return DeviceInfo.getOffloadEntriesTable(DeviceId);
-}
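
The two-step loading scheme in __tgt_rtl_load_binary (dump the image to a
temporary file, then load it as a shared object) does not depend on the LLVM
support library. A sketch of the same idea with plain POSIX dlopen/dlsym;
the helper is hypothetical and error handling is trimmed:

  #include <cstdio>
  #include <cstdlib>
  #include <dlfcn.h>
  #include <unistd.h>

  // Writes the image to a temporary file and resolves one symbol from it.
  static void *loadImageSymbol(const void *Image, size_t Size,
                               const char *Symbol) {
    char TmpName[] = "/tmp/tmpfile_XXXXXX";
    int Fd = mkstemp(TmpName);                // 1) temp file with the contents
    if (Fd == -1)
      return nullptr;
    FILE *F = fdopen(Fd, "wb");
    if (!F)
      return nullptr;
    fwrite(Image, Size, 1, F);
    fclose(F);
    void *Handle = dlopen(TmpName, RTLD_NOW); // 2) load and resolve
    unlink(TmpName);
    return Handle ? dlsym(Handle, Symbol) : nullptr;
  }

  int main() {
    const char Fake[] = "not an object";      // expect nullptr for junk input
    return loadImageSymbol(Fake, sizeof(Fake), "foo") == nullptr ? 0 : 1;
  }
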
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
- printf(" This is a generic-elf-64bit device\n");
-}
-
-// Sample implementation of explicit memory allocator. For this plugin all kinds
-// are equivalent to each other.
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
- int32_t Kind) {
- void *Ptr = nullptr;
-
- switch (Kind) {
- case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_HOST:
- case TARGET_ALLOC_SHARED:
- case TARGET_ALLOC_DEFAULT:
- Ptr = malloc(Size);
- break;
- default:
- REPORT("Invalid target data allocation kind");
- }
-
- return Ptr;
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size) {
- memcpy(TgtPtr, HstPtr, Size);
- return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size) {
- memcpy(HstPtr, TgtPtr, Size);
- return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
- free(TgtPtr);
- return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
- void **TgtArgs, ptrdiff_t *TgtOffsets,
- KernelArgsTy *KernelArgs,
- __tgt_async_info *AsyncInfoPtr) {
- assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
- !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
- "Only one dimensional kernels supported.");
- // Ignore the team number and thread limit.
-
- // Use libffi to launch execution.
- ffi_cif Cif;
-
- // All args are references.
- std::vector<ffi_type *> ArgsTypes(KernelArgs->NumArgs, &ffi_type_pointer);
- std::vector<void *> Args(KernelArgs->NumArgs);
- std::vector<void *> Ptrs(KernelArgs->NumArgs);
-
- for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
- Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
- Args[I] = &Ptrs[I];
- }
-
- ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs->NumArgs,
- &ffi_type_void, &ArgsTypes[0]);
-
- assert(Status == FFI_OK && "Unable to prepare target launch!");
-
- if (Status != FFI_OK)
- return OFFLOAD_FAIL;
-
- DP("Running entry point at " DPxMOD "...\n", DPxPTR(TgtEntryPtr));
-
- void (*Entry)(void);
- *((void **)&Entry) = TgtEntryPtr;
- ffi_call(&Cif, Entry, nullptr, &Args[0]);
- return OFFLOAD_SUCCESS;
-}
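
The launch sequence above is plain libffi: every kernel argument is passed
by reference, the return type is void, and the entry point is invoked
through ffi_call. A self-contained sketch of the same call pattern against
a local function (hypothetical kernel; build with -lffi):

  #include <cstdio>
  #include <ffi.h>

  static void kernel(int *A, int *B) { std::printf("%d %d\n", *A, *B); }

  int main() {
    int X = 1, Y = 2;
    void *Ptrs[] = {&X, &Y};             // "device" pointers after offsets
    void *Args[] = {&Ptrs[0], &Ptrs[1]}; // libffi takes pointers to the args
    ffi_type *Types[] = {&ffi_type_pointer, &ffi_type_pointer};

    ffi_cif Cif;
    if (ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, 2, &ffi_type_void, Types) !=
        FFI_OK)
      return 1;

    void (*Entry)(void);
    *((void **)&Entry) = reinterpret_cast<void *>(&kernel);
    ffi_call(&Cif, Entry, nullptr, Args);
    return 0;
  }
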
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
deleted file mode 100644
index 793bf0665c5d82..00000000000000
--- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
deleted file mode 100644
index cc8a50f1f59ec9..00000000000000
--- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64le machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
deleted file mode 100644
index d3e78ffb8646ef..00000000000000
--- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a x86_64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
-else()
- libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index d3e600683d601d..6fe7b4e9d414f1 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -99,8 +99,6 @@ void RTLsTy::loadRTLs() {
DP("Loading RTLs...\n");
- BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", true);
-
// Attempt to open all the plugins and, if they exist, check if the interface
// is correct and if they are supporting any devices.
for (const char *Name : RTLNames) {
@@ -109,13 +107,6 @@ void RTLsTy::loadRTLs() {
RTLInfoTy &RTL = AllRTLs.back();
const std::string BaseRTLName(Name);
- if (NextGenPlugins) {
- if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
- continue;
-
- DP("Falling back to original plugin...\n");
- }
-
if (!attemptLoadRTL(BaseRTLName + ".so", RTL))
AllRTLs.pop_back();
}