[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alex Duran via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 10:02:17 PDT 2025
https://github.com/adurang updated https://github.com/llvm/llvm-project/pull/158900
>From 0c427647d9ce0de9506992dfb16074178bebcc19 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 11:46:48 +0200
Subject: [PATCH 01/13] [OFFLOAD] Add plugin with support for Intel Level Zero
---
offload/CMakeLists.txt | 17 +-
.../Modules/LibomptargetGetDependencies.cmake | 21 +
offload/include/OpenMP/InteropAPI.h | 7 +-
offload/include/PerThreadTable.h | 155 ++-
.../plugins-nextgen/common/include/DLWrap.h | 16 +
.../plugins-nextgen/level_zero/CMakeLists.txt | 69 ++
.../level_zero/include/AsyncQueue.h | 50 +
.../level_zero/include/L0Context.h | 138 +++
.../level_zero/include/L0Defs.h | 73 ++
.../level_zero/include/L0Device.h | 680 +++++++++++
.../level_zero/include/L0Interop.h | 25 +
.../level_zero/include/L0Kernel.h | 154 +++
.../level_zero/include/L0Memory.h | 574 +++++++++
.../level_zero/include/L0Options.h | 189 +++
.../level_zero/include/L0Plugin.h | 136 +++
.../level_zero/include/L0Program.h | 135 +++
.../level_zero/include/L0Trace.h | 193 +++
.../plugins-nextgen/level_zero/include/TLS.h | 86 ++
.../level_zero/src/L0Context.cpp | 41 +
.../level_zero/src/L0Device.cpp | 1065 +++++++++++++++++
.../level_zero/src/L0DynWrapper.cpp | 134 +++
.../level_zero/src/L0Kernel.cpp | 649 ++++++++++
.../level_zero/src/L0Memory.cpp | 637 ++++++++++
.../level_zero/src/L0Options.cpp | 371 ++++++
.../level_zero/src/L0Plugin.cpp | 285 +++++
.../level_zero/src/L0Program.cpp | 625 ++++++++++
.../level_zero/src/OmpWrapper.cpp | 71 ++
27 files changed, 6586 insertions(+), 10 deletions(-)
create mode 100644 offload/plugins-nextgen/level_zero/CMakeLists.txt
create mode 100644 offload/plugins-nextgen/level_zero/include/AsyncQueue.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Context.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Defs.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Device.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Interop.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Kernel.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Memory.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Options.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Plugin.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Program.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Trace.h
create mode 100644 offload/plugins-nextgen/level_zero/include/TLS.h
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Context.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Device.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Memory.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Options.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Program.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index b277380783500..8a704ab05eb53 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
endif()
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero)
set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
- "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+ "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".")
if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
@@ -176,6 +176,19 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
endif()
endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
+ CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
+ if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building Level Zero plugin: it is only supported on "
+ "Linux/Windows x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+ endif()
+endif()
+if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
+ NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+ message(STATUS "Not building Level Zero plugin: dependencies not found")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+endif()
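+# For example (illustrative configure line; paths and generator are up to the
+# user), the plugin can be requested explicitly with:
+#   cmake -DLIBOMPTARGET_PLUGINS_TO_BUILD="level_zero;host" <llvm-src>/llvm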
message(STATUS "Building the offload library with support for "
"the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 2a8bdebf2c1dd..0af0ae1ecdbec 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -89,4 +89,25 @@ if(LIBOMPTARGET_AMDGPU_ARCH)
endif()
endif()
+################################################################################
+# Looking for Level Zero
+################################################################################
+message(STATUS "Looking for Level Zero includes.")
+find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
+
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
+ set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
+ message(STATUS "Could NOT find Level Zero. Missing includes.")
+else()
+ message(STATUS "Level Zero include DIR: ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}")
+ set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
+ message(STATUS "Looking for Level Zero library.")
+ find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
+ if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES)
+ message(STATUS "Could NOT find Level Zero. Missing library.")
+ else()
+ message(STATUS "Level Zero library: ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES}")
+ endif()
+endif()
+
set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h
index 53ac4be2e2e98..2553bfa930784 100644
--- a/offload/include/OpenMP/InteropAPI.h
+++ b/offload/include/OpenMP/InteropAPI.h
@@ -160,17 +160,12 @@ struct InteropTableEntry {
Interops.push_back(obj);
}
- template <class ClearFuncTy> void clear(ClearFuncTy f) {
- for (auto &Obj : Interops) {
- f(Obj);
- }
- }
-
/// vector interface
int size() const { return Interops.size(); }
iterator begin() { return Interops.begin(); }
iterator end() { return Interops.end(); }
iterator erase(iterator it) { return Interops.erase(it); }
+ void clear() { Interops.clear(); }
};
struct InteropTblTy
diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index 45b196171b4c8..0241370953c67 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -16,6 +16,60 @@
#include <list>
#include <memory>
#include <mutex>
+#include <type_traits>
+
+template <typename ObjectType> struct PerThread {
+ struct PerThreadData {
+ std::unique_ptr<ObjectType> ThEntry;
+ };
+
+ std::mutex Mtx;
+ std::list<std::shared_ptr<PerThreadData>> ThreadDataList;
+
+  // Default constructor only; copy/move construction and assignment are disabled.
+ PerThread() = default;
+ PerThread(const PerThread &) = delete;
+ PerThread(PerThread &&) = delete;
+ PerThread &operator=(const PerThread &) = delete;
+ PerThread &operator=(PerThread &&) = delete;
+ ~PerThread() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ThreadDataList.clear();
+ }
+
+private:
+ PerThreadData &getThreadData() {
+ static thread_local std::shared_ptr<PerThreadData> ThData = nullptr;
+ if (!ThData) {
+ ThData = std::make_shared<PerThreadData>();
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ThreadDataList.push_back(ThData);
+ }
+ return *ThData;
+ }
+
+protected:
+ ObjectType &getThreadEntry() {
+ auto &ThData = getThreadData();
+ if (ThData.ThEntry)
+ return *ThData.ThEntry;
+ ThData.ThEntry = std::make_unique<ObjectType>();
+ return *ThData.ThEntry;
+ }
+
+public:
+ ObjectType &get() { return getThreadEntry(); }
+
+ template <class F> void clear(F f) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ for (auto ThData : ThreadDataList) {
+ if (!ThData->ThEntry)
+ continue;
+ f(*ThData->ThEntry);
+ }
+ ThreadDataList.clear();
+ }
+};
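+// Illustrative usage sketch (hypothetical 'Counter' type, not part of this
+// patch): each thread lazily creates its own entry via get(), and clear()
+// visits and then drops every thread's entry.
+//
+//   struct Counter { int Hits = 0; };
+//   PerThread<Counter> Counters;
+//   void onEvent() { ++Counters.get().Hits; } // thread-private, no locking
+//   void shutdown() {
+//     Counters.clear([](Counter &C) { /* flush C.Hits somewhere */ });
+//   }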
// Using an STL container (such as std::vector) indexed by thread ID has
// too many race conditions issues so we store each thread entry into a
@@ -23,10 +77,32 @@
// T is the container type used to store the objects, e.g., std::vector,
// std::set, etc. by each thread. O is the type of the stored objects e.g.,
// omp_interop_val_t *, ...
-
template <typename ContainerType, typename ObjectType> struct PerThreadTable {
using iterator = typename ContainerType::iterator;
+ template <typename, typename = std::void_t<>>
+ struct has_iterator : std::false_type {};
+ template <typename T>
+ struct has_iterator<T, std::void_t<typename T::iterator>> : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_clear : std::false_type {};
+ template <typename T>
+ struct has_clear<T, std::void_t<decltype(std::declval<T>().clear())>>
+ : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_clearAll : std::false_type {};
+ template <typename T>
+ struct has_clearAll<T, std::void_t<decltype(std::declval<T>().clearAll(1))>>
+ : std::true_type {};
+
+ template <typename, typename = std::void_t<>>
+ struct is_associative : std::false_type {};
+ template <typename T>
+ struct is_associative<T, std::void_t<typename T::mapped_type>>
+ : std::true_type {};
+
struct PerThreadData {
size_t NElements = 0;
std::unique_ptr<ContainerType> ThEntry;
@@ -71,6 +147,11 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
return ThData.NElements;
}
+ void setNElements(size_t Size) {
+ auto &NElements = getThreadNElements();
+ NElements = Size;
+ }
+
public:
void add(ObjectType obj) {
auto &Entry = getThreadEntry();
@@ -104,11 +185,81 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
for (auto ThData : ThreadDataList) {
if (!ThData->ThEntry || ThData->NElements == 0)
continue;
- ThData->ThEntry->clear(f);
+ if constexpr (has_clearAll<ContainerType>::value) {
+ ThData->ThEntry->clearAll(f);
+ } else if constexpr (has_iterator<ContainerType>::value &&
+ has_clear<ContainerType>::value) {
+ for (auto &Obj : *ThData->ThEntry) {
+ if constexpr (is_associative<ContainerType>::value) {
+ f(Obj.second);
+ } else {
+ f(Obj);
+ }
+ }
+ ThData->ThEntry->clear();
+ } else {
+        static_assert(!sizeof(ContainerType), "Container type not supported");
+ }
ThData->NElements = 0;
}
ThreadDataList.clear();
}
};
+template <typename T, typename = std::void_t<>> struct ContainerValueType {
+ using type = typename T::value_type;
+};
+template <typename T>
+struct ContainerValueType<T, std::void_t<typename T::mapped_type>> {
+ using type = typename T::mapped_type;
+};
+
+template <typename ContainerType, size_t reserveSize = 0>
+struct PerThreadContainer
+ : public PerThreadTable<ContainerType,
+ typename ContainerValueType<ContainerType>::type> {
+
+ // helpers
+ template <typename T, typename = std::void_t<>> struct indexType {
+ using type = typename T::size_type;
+ };
+ template <typename T> struct indexType<T, std::void_t<typename T::key_type>> {
+ using type = typename T::key_type;
+ };
+ template <typename T, typename = std::void_t<>>
+ struct has_resize : std::false_type {};
+ template <typename T>
+ struct has_resize<T, std::void_t<decltype(std::declval<T>().resize(1))>>
+ : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_reserve : std::false_type {};
+ template <typename T>
+ struct has_reserve<T, std::void_t<decltype(std::declval<T>().reserve(1))>>
+ : std::true_type {};
+
+ using IndexType = typename indexType<ContainerType>::type;
+ using ObjectType = typename ContainerValueType<ContainerType>::type;
+
+ // Get the object for the given index in the current thread
+ ObjectType &get(IndexType Index) {
+ auto &Entry = this->getThreadEntry();
+
+ // specialized code for vector-like containers
+ if constexpr (has_resize<ContainerType>::value) {
+ if (Index >= Entry.size()) {
+ if constexpr (has_reserve<ContainerType>::value && reserveSize > 0) {
+ if (Entry.capacity() < reserveSize)
+ Entry.reserve(reserveSize);
+ }
+      // If the index is out of bounds, try to resize the container
+ Entry.resize(Index + 1);
+ }
+ }
+ ObjectType &Ret = Entry[Index];
+ this->setNElements(Entry.size());
+ return Ret;
+ }
+};
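+// Illustrative instantiations (hypothetical names): the same template serves
+// vector-like containers (indexed by size_type, auto-resized) and associative
+// containers (indexed by key_type).
+//
+//   PerThreadContainer<std::vector<int>, 8> Ints;
+//   Ints.get(3) = 42; // grows this thread's vector to size 4
+//
+//   PerThreadContainer<std::unordered_map<void *, int>> Map;
+//   Map.get(Handle) = 1; // per-thread lookup/insert ('Handle' assumed)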
+
#endif
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h
index 8934e7e701021..95ce86e123cd3 100644
--- a/offload/plugins-nextgen/common/include/DLWrap.h
+++ b/offload/plugins-nextgen/common/include/DLWrap.h
@@ -282,5 +282,21 @@ template <size_t Requested, size_t Required> constexpr void verboseAssert() {
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
x9, x10); \
}
+#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9, \
+ typename T::template arg<10>::type x10, \
+ typename T::template arg<11>::type x11) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10, x11); \
+ }
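+// With this, 12-argument entry points can be wrapped like the lower-arity
+// ones. For example (assuming the usual DLWRAP(SYM, ARITY) usage in the
+// dynamic wrappers), zeCommandListAppendMemoryCopyRegion takes 12 arguments
+// and would be declared as DLWRAP(zeCommandListAppendMemoryCopyRegion, 12).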
#endif // OMPTARGET_SHARED_DLWRAP_H
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
new file mode 100644
index 0000000000000..b9c8dd423c3ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -0,0 +1,69 @@
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+  return()
+endif()
+
+# Create the library and add the default arguments.
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
+
+set(LEVEL_ZERO_SRC_FILES
+  src/L0Context.cpp
+  src/L0Device.cpp
+  src/L0Kernel.cpp
+  src/L0Memory.cpp
+  src/L0Plugin.cpp
+  src/L0Program.cpp
+  src/L0Options.cpp
+)
+list(APPEND LEVEL_ZERO_SRC_FILES
+ src/OmpWrapper.cpp
+)
+
+target_sources(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_SRC_FILES}
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_OMP_HEADER_DIR}
+)
+
+if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
+message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Full path to the L0 library is recognized as a linker option, so we
+ # separate directory and file name
+ get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+ get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+ target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
+ target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+ libomptarget_add_resource_file(omptarget.rtl.level_zero)
+else()
+ message(FATAL_ERROR "Missing platfrom support")
+endif()
+
+else()
+message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Windows uses dll instead of lib files at runtime
+ string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
+endif()
+target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+endif()
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
new file mode 100644
index 0000000000000..105f68205e402
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -0,0 +1,50 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Async Queue wrapper for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <vector>
+
+#include "L0Memory.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Abstract queue that supports asynchronous command submission
+struct AsyncQueueTy {
+  /// List of events attached to submitted commands
+ std::vector<ze_event_handle_t> WaitEvents;
+ /// Pending staging buffer to host copies
+ std::list<std::tuple<void *, void *, size_t>> H2MList;
+ /// Pending USM memory copy commands that must wait for kernel completion
+ std::list<std::tuple<const void *, void *, size_t>> USM2MList;
+ /// Kernel event not signaled
+ ze_event_handle_t KernelEvent = nullptr;
+ /// Is this queue being used currently
+ bool InUse = false;
+ /// Clear data
+ void reset() {
+ WaitEvents.clear();
+ H2MList.clear();
+ USM2MList.clear();
+ KernelEvent = nullptr;
+ }
+};
+
+typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
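+// Typical lifecycle sketch (pool interface assumed for illustration): a queue
+// is taken from the pool for one async region, accumulates events and
+// deferred copies, and is reset before reuse.
+//
+//   AsyncQueueTy *Queue = Pool.get();  // 'Pool' is an AsyncQueuePoolTy
+//   Queue->WaitEvents.push_back(Ev);   // commands record their events
+//   /* ... submit kernel, queue deferred H2M/USM2M copies ... */
+//   Queue->reset();                    // clear bookkeeping before reuse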
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
new file mode 100644
index 0000000000000..b2b6def8101ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Memory.h"
+#include "PerThreadTable.h"
+
+namespace llvm::omp::target::plugin {
+
+class LevelZeroPluginTy;
+
+class L0ContextTLSTy {
+ StagingBufferTy StagingBuffer;
+
+public:
+ auto &getStagingBuffer() { return StagingBuffer; }
+ const auto &getStagingBuffer() const { return StagingBuffer; }
+
+ void clear() { StagingBuffer.clear(); }
+};
+
+struct L0ContextTLSTableTy
+ : public PerThreadContainer<
+ std::unordered_map<ze_context_handle_t, L0ContextTLSTy>> {
+ void clear() {
+ PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); });
+ }
+};
+
+/// Driver and context-specific resources. We assume a single context per
+/// driver.
+class L0ContextTy {
+ /// The plugin that created this context
+ LevelZeroPluginTy &Plugin;
+
+ /// Level Zero Driver handle
+ ze_driver_handle_t zeDriver = nullptr;
+
+ /// Common Level Zero context
+ ze_context_handle_t zeContext = nullptr;
+
+ /// API version supported by the Level Zero driver
+ ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT;
+
+ /// Imported external pointers. Track this only for user-directed
+ /// imports/releases.
+ std::unordered_map<uintptr_t, size_t> ImportedPtrs;
+
+ /// Common event pool
+ EventPoolTy EventPool;
+
+ /// Host Memory allocator for this driver
+ MemAllocatorTy HostMemAllocator;
+
+public:
+ /// Named constants for checking the imported external pointer regions.
+ static constexpr int32_t ImportNotExist = -1;
+ static constexpr int32_t ImportUnknown = 0;
+ static constexpr int32_t ImportExist = 1;
+
+ /// Create context, initialize event pool and extension functions
+ L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+ int32_t DriverId);
+
+ L0ContextTy(const L0ContextTy &) = delete;
+ L0ContextTy(L0ContextTy &&) = delete;
+ L0ContextTy &operator=(const L0ContextTy &) = delete;
+ L0ContextTy &operator=(const L0ContextTy &&) = delete;
+
+ /// Release resources
+ ~L0ContextTy() {
+ EventPool.deinit();
+ HostMemAllocator.deinit();
+ if (zeContext)
+ CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
+ }
+
+ auto &getPlugin() const { return Plugin; }
+
+ StagingBufferTy &getStagingBuffer();
+
+ /// Add imported external pointer region.
+ void addImported(void *Ptr, size_t Size) {
+ (void)ImportedPtrs.emplace((uintptr_t)Ptr, Size);
+ }
+
+ /// Remove imported external pointer region
+ void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); }
+
+ /// Check if imported regions contain the specified region.
+ int32_t checkImported(void *Ptr, size_t Size) const {
+ uintptr_t LB = (uintptr_t)Ptr;
+ uintptr_t UB = LB + Size;
+ // We do not expect a large number of user-directed imports, so use simple
+ // logic.
+ for (auto &I : ImportedPtrs) {
+ uintptr_t ILB = I.first;
+ uintptr_t IUB = ILB + I.second;
+ if (LB >= ILB && UB <= IUB)
+ return ImportExist;
+ if ((LB >= ILB && LB < IUB) || (UB > ILB && UB <= IUB))
+ return ImportUnknown;
+ }
+ return ImportNotExist;
+ }
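+  // For example, with a single imported region [P, P+64):
+  //   checkImported(P, 64)      -> ImportExist   (fully contained)
+  //   checkImported(P + 32, 64) -> ImportUnknown (partial overlap)
+  //   checkImported(P + 128, 8) -> ImportNotExist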
+
+ ze_driver_handle_t getZeDriver() const { return zeDriver; }
+
+ /// Return context associated with the driver
+ ze_context_handle_t getZeContext() const { return zeContext; }
+
+ /// Return driver API version
+ ze_api_version_t getDriverAPIVersion() const { return APIVersion; }
+
+ /// Return the event pool of this driver
+ auto &getEventPool() { return EventPool; }
+ const auto &getEventPool() const { return EventPool; }
+
+ bool supportsLargeMem() const {
+ // Large memory support is available since API version 1.1
+ return getDriverAPIVersion() >= ZE_API_VERSION_1_1;
+ }
+
+ const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
+ MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
new file mode 100644
index 0000000000000..81566f52a2aea
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -0,0 +1,73 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// External and other auxiliary definitions
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "PluginInterface.h"
+#include "Shared/Requirements.h"
+#include "omptarget.h"
+
+#define LIBOMP_DECL(RetType, FnDecl) RetType __cdecl FnDecl
+
+enum class AllocOptionTy : int32_t {
+ ALLOC_OPT_NONE = 0,
+ ALLOC_OPT_REDUCTION_SCRATCH = 1,
+ ALLOC_OPT_REDUCTION_COUNTER = 2,
+ ALLOC_OPT_HOST_MEM = 3,
+ ALLOC_OPT_SLM = 4,
+};
+
+/// Host runtime routines being used
+extern "C" {
+LIBOMP_DECL(int, omp_get_max_teams(void));
+LIBOMP_DECL(int, omp_get_thread_limit(void));
+LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
+LIBOMP_DECL(double, omp_get_wtime(void));
+} // extern "C"
+
+#ifndef EXTRACT_BITS
+// MSB=63, LSB=0
+#define EXTRACT_BITS(I64, HIGH, LOW)                                          \
+  ((((uint64_t)(I64)) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1))
+#endif
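+// For example, EXTRACT_BITS(0xABCD, 15, 8) selects bits 15..8 and yields
+// 0xAB.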
+
+namespace llvm::omp::target::plugin {
+
+/// Default alignment for allocation
+constexpr size_t L0Alignment = 0;
+/// Default staging buffer size for host to device copy (16KB)
+constexpr size_t L0StagingBufferSize = (1 << 14);
+/// Default staging buffer count
+constexpr size_t L0StagingBufferCount = 64;
+/// USM allocation threshold where preallocation does not pay off (128MB)
+constexpr size_t L0UsmPreAllocThreshold = (128 << 20);
+/// Host USM allocation threshold where preallocation does not pay off (8MB)
+constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20);
+
+using namespace error;
+/// Generic L0 handle type
+using ZeHandleTy = void *;
+
+template <typename... ArgsTy>
+static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+
+ if (Code == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ const char *Desc = "Unknown error";
+ return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
+ ErrFmt, Args..., Desc);
+}
+
+#define L0_UNIMPLEMENTED_ERR \
+ return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet\n", \
+ __func__);
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
new file mode 100644
index 0000000000000..6acfa7e0ee67d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -0,0 +1,680 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/SmallVector.h"
+
+#include "PerThreadTable.h"
+
+#include "AsyncQueue.h"
+#include "L0Context.h"
+#include "L0Program.h"
+#include "PluginInterface.h"
+#include "TLS.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+using OmpInteropTy = omp_interop_val_t *;
+class LevelZeroPluginTy;
+
+// clang-format off
+enum class PCIIdTy : int32_t {
+ None = 0x0000,
+ SKL = 0x1900,
+ KBL = 0x5900,
+ CFL = 0x3E00,
+ CFL_2 = 0x9B00,
+ ICX = 0x8A00,
+ TGL = 0xFF20,
+ TGL_2 = 0x9A00,
+ DG1 = 0x4900,
+ RKL = 0x4C00,
+ ADLS = 0x4600,
+ RTL = 0xA700,
+ MTL = 0x7D00,
+ PVC = 0x0B00,
+ DG2_ATS_M = 0x4F00,
+ DG2_ATS_M_2 = 0x5600,
+ LNL = 0x6400,
+ BMG = 0xE200,
+};
+
+/// Device type enumeration common to compiler and runtime
+enum class DeviceArchTy : uint64_t {
+ DeviceArch_None = 0,
+ DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe
+ DeviceArch_XeLPG = 0x0002,
+ DeviceArch_XeHPC = 0x0004,
+ DeviceArch_XeHPG = 0x0008,
+ DeviceArch_Xe2LP = 0x0010,
+ DeviceArch_Xe2HP = 0x0020,
+ DeviceArch_x86_64 = 0x0100
+};
+// clang-format on
+
+struct L0DeviceIdTy {
+ ze_device_handle_t zeId;
+ int32_t RootId;
+ int32_t SubId;
+ int32_t CCSId;
+
+ L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1,
+ int32_t CCSId = -1)
+ : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {}
+};
+
+class L0DeviceTLSTy {
+ /// Command list for each device
+ ze_command_list_handle_t CmdList = nullptr;
+
+ /// Main copy command list for each device
+ ze_command_list_handle_t CopyCmdList = nullptr;
+
+ /// Link copy command list for each device
+ ze_command_list_handle_t LinkCopyCmdList = nullptr;
+
+ /// Command queue for each device
+ ze_command_queue_handle_t CmdQueue = nullptr;
+
+ /// Main copy command queue for each device
+ ze_command_queue_handle_t CopyCmdQueue = nullptr;
+
+ /// Link copy command queues for each device
+ ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
+
+ /// Immediate command list for each device
+ ze_command_list_handle_t ImmCmdList = nullptr;
+
+ /// Immediate copy command list for each device
+ ze_command_list_handle_t ImmCopyCmdList = nullptr;
+
+public:
+ L0DeviceTLSTy() = default;
+ ~L0DeviceTLSTy() {
+ // assert all fields are nullptr on destruction
+ assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
+ assert(CopyCmdList == nullptr &&
+ "CopyCmdList is not nullptr on destruction");
+ assert(LinkCopyCmdList == nullptr &&
+ "LinkCopyCmdList is not nullptr on destruction");
+ assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
+ assert(CopyCmdQueue == nullptr &&
+ "CopyCmdQueue is not nullptr on destruction");
+ assert(LinkCopyCmdQueue == nullptr &&
+ "LinkCopyCmdQueue is not nullptr on destruction");
+ assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
+ assert(ImmCopyCmdList == nullptr &&
+ "ImmCopyCmdList is not nullptr on destruction");
+ }
+
+ L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
+ L0DeviceTLSTy(L0DeviceTLSTy &&Other) {
+ CmdList = std::exchange(Other.CmdList, nullptr);
+ CopyCmdList = std::exchange(Other.CopyCmdList, nullptr);
+ LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr);
+ CmdQueue = std::exchange(Other.CmdQueue, nullptr);
+ CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr);
+ LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr);
+ ImmCmdList = std::exchange(Other.ImmCmdList, nullptr);
+ ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr);
+ }
+
+ void clear() {
+ // destroy all lists and queues
+ if (CmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
+ if (CopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
+ if (LinkCopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
+ if (ImmCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
+ if (ImmCopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList);
+ if (CmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
+ if (CopyCmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
+ if (LinkCopyCmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
+
+ CmdList = nullptr;
+ CopyCmdList = nullptr;
+ LinkCopyCmdList = nullptr;
+ CmdQueue = nullptr;
+ CopyCmdQueue = nullptr;
+ LinkCopyCmdQueue = nullptr;
+ ImmCmdList = nullptr;
+ ImmCopyCmdList = nullptr;
+ }
+
+ L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
+ L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
+
+ auto getCmdList() const { return CmdList; }
+ void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; }
+
+ auto getCopyCmdList() const { return CopyCmdList; }
+ void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) {
+ CopyCmdList = _CopyCmdList;
+ }
+
+ auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+ void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) {
+ LinkCopyCmdList = _LinkCopyCmdList;
+ }
+
+ auto getImmCmdList() const { return ImmCmdList; }
+ void setImmCmdList(ze_command_list_handle_t _ImmCmdList) {
+ ImmCmdList = _ImmCmdList;
+ }
+
+ auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+ void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) {
+ ImmCopyCmdList = _ImmCopyCmdList;
+ }
+
+ auto getCmdQueue() const { return CmdQueue; }
+ void setCmdQueue(ze_command_queue_handle_t _CmdQueue) {
+ CmdQueue = _CmdQueue;
+ }
+
+ auto getCopyCmdQueue() const { return CopyCmdQueue; }
+ void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) {
+ CopyCmdQueue = _CopyCmdQueue;
+ }
+
+ auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+ void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) {
+ LinkCopyCmdQueue = _LinkCopyCmdQueue;
+ }
+};
+
+struct L0DeviceTLSTableTy
+ : public PerThreadContainer<std::vector<L0DeviceTLSTy>, 8> {
+ void clear() {
+ PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); });
+ }
+};
+
+class L0DeviceTy final : public GenericDeviceTy {
+ // Level Zero Context for this Device
+ L0ContextTy &l0Context;
+
+ // Level Zero handle for this Device
+ ze_device_handle_t zeDevice;
+ // Device Properties
+ ze_device_properties_t DeviceProperties{};
+ ze_device_compute_properties_t ComputeProperties{};
+ ze_device_memory_properties_t MemoryProperties{};
+ ze_device_cache_properties_t CacheProperties{};
+
+ /// Devices' default target allocation kind for internal allocation
+ int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+ DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None;
+
+ std::string DeviceName;
+
+ /// Common indirect access flags for this device
+ ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0;
+
+  /// Device UUID for top-level devices only
+ std::string DeviceUuid;
+
+ /// L0 Device ID as string
+ std::string zeId;
+
+  /// Compute command queue group ordinal and number of queues
+  std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
+  /// Main copy command queue group ordinal and number of queues
+  std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
+  /// Link copy command queue group ordinal and number of queues
+  std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
+
+  /// Compute command queue index used by this device
+  uint32_t ComputeIndex = 0;
+
+ bool IsAsyncEnabled = false;
+
+ // lock for this device
+ std::mutex Mutex;
+
+ /// Contains all modules (possibly from multiple device images) to handle
+ /// dynamic link across multiple images
+ llvm::SmallVector<ze_module_handle_t> GlobalModules;
+
+ /// L0 programs created for this device
+ std::list<L0ProgramTy> Programs;
+
+ /// MemAllocator for this device
+ MemAllocatorTy MemAllocator;
+
+ /// The current size of the global device memory pool (managed by us).
+ uint64_t HeapSize = 1L << 23L /*8MB=*/;
+
+ int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
+ int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+ int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+
+ bool shouldSetupDeviceMemoryPool() const override { return false; }
+ DeviceArchTy computeArch() const;
+
+ /// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findComputeOrdinal();
+
+ /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+
+ Error internalInit();
+
+public:
+ L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+ ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
+ const std::string &zeId, int32_t ComputeIndex)
+ : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+ l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
+ ComputeIndex(ComputeIndex) {
+ DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ DeviceProperties.pNext = nullptr;
+ ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
+ ComputeProperties.pNext = nullptr;
+ MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+ MemoryProperties.pNext = nullptr;
+ CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
+ CacheProperties.pNext = nullptr;
+
+ auto Err = internalInit();
+ if (Err) {
+ FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
+ toString(std::move(Err)).c_str());
+ }
+ }
+
+ static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
+ return static_cast<L0DeviceTy &>(Device);
+ }
+ static L0DeviceTy &makeL0Device(ompt_device_t *Device) {
+ return *static_cast<L0DeviceTy *>(Device);
+ }
+
+ auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+ L0DeviceTLSTy &getTLS();
+
+ Error setContext() override { return Plugin::success(); }
+ Error initImpl(GenericPluginTy &Plugin) override;
+ Error deinitImpl() override {
+ Programs.clear();
+ return Plugin::success();
+ }
+
+ auto getZeDevice() const { return zeDevice; }
+
+ const L0ContextTy &getL0Context() const { return l0Context; }
+ L0ContextTy &getL0Context() { return l0Context; }
+
+ const std::string &getName() const { return DeviceName; }
+ const char *getNameCStr() const { return DeviceName.c_str(); }
+
+ const std::string &getZeId() const { return zeId; }
+ const char *getZeIdCStr() const { return zeId.c_str(); }
+
+ std::mutex &getMutex() { return Mutex; }
+
+ auto getComputeIndex() const { return ComputeIndex; }
+ auto getIndirectFlags() const { return IndirectAccessFlags; }
+
+ auto getNumGlobalModules() const { return GlobalModules.size(); }
+ void addGlobalModule(ze_module_handle_t Module) {
+ GlobalModules.push_back(Module);
+ }
+ auto getGlobalModulesArray() { return GlobalModules.data(); }
+
+ L0ProgramTy *getProgramFromImage(const __tgt_device_image *Image) {
+ for (auto &PGM : Programs)
+ if (PGM.getTgtImage() == Image)
+ return &PGM;
+ return nullptr;
+ }
+
+ int32_t buildAllKernels() {
+ for (auto &PGM : Programs) {
+ int32_t RC = PGM.loadModuleKernels();
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ }
+ return OFFLOAD_SUCCESS;
+ }
+
+  // Add a new program to the device. Returns a reference to the new program.
+ auto &addProgram(int32_t ImageId, const __tgt_device_image *Image) {
+ Programs.emplace_back(ImageId, *this, Image);
+ return Programs.back();
+ }
+
+ const auto &getLastProgram() const { return Programs.back(); }
+ auto &getLastProgram() { return Programs.back(); }
+ // Device properties getters
+ auto getVendorId() const { return DeviceProperties.vendorId; }
+ bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
+
+ auto getPCIId() const { return DeviceProperties.deviceId; }
+ auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
+ auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+ auto getNumEUsPerSubslice() const {
+ return DeviceProperties.numEUsPerSubslice;
+ }
+ auto getNumSubslicesPerSlice() const {
+ return DeviceProperties.numSubslicesPerSlice;
+ }
+ auto getNumSlices() const { return DeviceProperties.numSlices; }
+ auto getNumSubslices() const {
+ return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
+ }
+ uint32_t getNumEUs() const {
+ return DeviceProperties.numEUsPerSubslice * getNumSubslices();
+ }
+ auto getTotalThreads() const {
+ return DeviceProperties.numThreadsPerEU * getNumEUs();
+ }
+ auto getNumThreadsPerSubslice() const {
+ return getNumEUsPerSubslice() * getNumThreadsPerEU();
+ }
+ auto getClockRate() const { return DeviceProperties.coreClockRate; }
+
+ auto getMaxSharedLocalMemory() const {
+ return ComputeProperties.maxSharedLocalMemory;
+ }
+ auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+ auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+ auto getCacheSize() const { return CacheProperties.cacheSize; }
+
+ int32_t getAllocKind() const { return AllocKind; }
+ DeviceArchTy getDeviceArch() const { return DeviceArch; }
+ bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; }
+
+ static bool isDiscrete(uint32_t PCIId) {
+ switch (static_cast<PCIIdTy>(PCIId & 0xFF00)) {
+ case PCIIdTy::BMG:
+ return true;
+ default:
+ return false;
+ }
+ }
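+  // For example, a (hypothetical) BMG PCI ID such as 0xE20B masks to 0xE200
+  // == PCIIdTy::BMG, so it is reported as discrete.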
+
+ static bool isDiscrete(ze_device_handle_t Device) {
+ ze_device_properties_t PR{};
+ PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ PR.pNext = nullptr;
+ CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR);
+ return isDiscrete(PR.deviceId);
+ }
+
+ bool isDiscreteDevice() { return isDiscrete(getPCIId()); }
+ bool isDeviceIPorNewer(uint32_t Version) const;
+
+ const std::string &getUuid() const { return DeviceUuid; }
+
+ uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
+ uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
+
+ bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
+ uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
+
+ uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
+ uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
+ bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
+
+ bool deviceRequiresImmCmdList() const {
+ return isDeviceIPorNewer(0x05004000);
+ }
+ bool asyncEnabled() const { return IsAsyncEnabled; }
+ bool useImmForCompute() const { return true; }
+ bool useImmForCopy() const { return true; }
+ bool useImmForInterop() const { return true; }
+ bool forceInorderInterop() const { return true; }
+
+ void reportDeviceInfo() const;
+
+ // Command queues related functions
+ /// Create a command list with given ordinal and flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ ze_command_list_flags_t Flags,
+ const std::string &DeviceIdStr);
+
+ /// Create a command list with default flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ const std::string &DeviceIdStr);
+
+ ze_command_list_handle_t getCmdList();
+
+ /// Create a command queue with given ordinal and flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ ze_command_queue_flags_t Flags,
+ const std::string &DeviceIdStr);
+
+ /// Create a command queue with default flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ const std::string &DeviceIdStr,
+ bool InOrder = false);
+
+ /// Create a new command queue for the given OpenMP device ID
+ ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+
+ /// Create an immediate command list
+ ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+ bool InOrder = false);
+
+ /// Create an immediate command list for computing
+ ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+ return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
+ }
+
+ /// Create an immediate command list for copying
+ ze_command_list_handle_t createImmCopyCmdList();
+ ze_command_queue_handle_t getCmdQueue();
+ ze_command_list_handle_t getCopyCmdList();
+ ze_command_queue_handle_t getCopyCmdQueue();
+ ze_command_list_handle_t getLinkCopyCmdList();
+ ze_command_queue_handle_t getLinkCopyCmdQueue();
+ ze_command_list_handle_t getImmCmdList();
+ ze_command_list_handle_t getImmCopyCmdList();
+
+ /// Enqueue copy command
+ int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo = nullptr,
+ bool Locked = false, bool UseCopyEngine = true);
+
+ /// Enqueue asynchronous copy command
+ int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo, bool CopyTo = true);
+
+ /// Enqueue fill command
+ int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
+ size_t Size);
+
+ /// Driver related functions
+
+  /// Return the driver handle for this device
+ ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); }
+
+ /// Return context for this device
+ ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); }
+
+ /// Return driver API version for this device
+ ze_api_version_t getDriverAPIVersion() const {
+ return l0Context.getDriverAPIVersion();
+ }
+
+ /// Return an event from the driver associated to this device
+ ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); }
+
+ /// Release event to the pool associated to this device
+ void releaseEvent(ze_event_handle_t Event) {
+ l0Context.getEventPool().releaseEvent(Event, *this);
+ }
+
+ StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); }
+
+ bool supportsLargeMem() const { return l0Context.supportsLargeMem(); }
+
+ // Allocation related routines
+
+ /// Data alloc
+ void *dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+ bool UserAlloc, bool DevMalloc = false,
+ uint32_t MemAdvice = UINT32_MAX,
+ AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
+
+ /// Data delete
+ int32_t dataDelete(void *Ptr);
+
+ /// Return the memory allocation type for the specified memory location.
+ uint32_t getMemAllocType(const void *Ptr) const;
+
+ const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; }
+ MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; }
+
+ MemAllocatorTy &getMemAllocator(int32_t Kind) {
+ if (Kind == TARGET_ALLOC_HOST)
+ return l0Context.getHostMemAllocator();
+ return getDeviceMemAllocator();
+ }
+
+ MemAllocatorTy &getMemAllocator(const void *Ptr) {
+ bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr));
+ if (IsHostMem)
+ return l0Context.getHostMemAllocator();
+ return getDeviceMemAllocator();
+ }
+
+ int32_t makeMemoryResident(void *Mem, size_t Size);
+
+ // Generic device interface implementation
+ Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+ int32_t ImageId) override;
+ Error unloadBinaryImpl(DeviceImageTy *Image) override;
+ void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+ int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
+
+ Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+ "dataLockImpl not supported");
+ }
+ Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
+ Expected<bool> isPinnedPtrImpl(void *, void *&, void *&,
+ size_t &) const override {
+ // Don't need to do anything, this is handled by the driver.
+ return false;
+ }
+
+ Error dataFence(__tgt_async_info *Async) override;
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+ bool ReleaseQueue) override;
+ Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override;
+ Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+ void *DstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error initDeviceInfoImpl(__tgt_device_info *Info) override;
+ Expected<bool>
+ hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override {
+    L0_UNIMPLEMENTED_ERR
+  }
+
+  /* Event routines are used to ensure ordering between data transfers. Instead
+   * of adding extra events in the queues, we make sure they're ordered by
+   * using the events from the data submission APIs, so we don't need to
+   * support these routines. They still need to report success to indicate
+   * that the events are handled somewhere; waitEvent and syncEvent should
+   * remain unimplemented.
+   */
+ Expected<bool> isEventCompleteImpl(void *EventPtr,
+ AsyncInfoWrapperTy &) override {
+ return true;
+ }
+
+ Error createEventImpl(void **EventPtrStorage) override {
+ return Plugin::success();
+ }
+ Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
+ Error recordEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::success();
+ }
+
+ Error waitEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+ __func__);
+ }
+
+ Error syncEventImpl(void *EventPtr) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+ __func__);
+ }
+
+ Expected<InfoTreeNode> obtainInfoImpl() override;
+
+ Error getDeviceStackSize(uint64_t &V) override {
+ V = 0;
+ return Plugin::success();
+ }
+ Expected<GenericKernelTy &> constructKernel(const char *Name) override;
+
+ Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); }
+ Error getDeviceHeapSize(uint64_t &V) override {
+ V = HeapSize;
+ return Plugin::success();
+ }
+ Error setDeviceHeapSize(uint64_t V) override {
+ HeapSize = V;
+ return Plugin::success();
+ }
+
+ Expected<omp_interop_val_t *>
+ createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override;
+ Error releaseInterop(omp_interop_val_t *Interop) override;
+
+ interop_spec_t selectInteropPreference(int32_t InteropType,
+ int32_t NumPrefers,
+ interop_spec_t *Prefers) override;
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
new file mode 100644
index 0000000000000..4b8b417f9b339
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -0,0 +1,25 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interop support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+namespace llvm::omp::target::plugin::L0Interop {
+
+/// Level Zero interop property
+struct Property {
+  // Use this when the command queue needs to be accessed directly, since the
+  // targetsync field in the interop object is changed when the preferred type
+  // is SYCL.
+ ze_command_queue_handle_t CommandQueue;
+ ze_command_list_handle_t ImmCmdList;
+};
+
+} // namespace llvm::omp::target::plugin::L0Interop
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
new file mode 100644
index 0000000000000..bc6fc54cdea08
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -0,0 +1,154 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+#include "PluginInterface.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+class L0ProgramTy;
+
+/// Loop descriptor
+struct TgtLoopDescTy {
+ int64_t Lb = 0; // The lower bound of the i-th loop
+ int64_t Ub = 0; // The upper bound of the i-th loop
+ int64_t Stride = 0; // The stride of the i-th loop
+};
+
+struct TgtNDRangeDescTy {
+ int32_t NumLoops = 0; // Number of loops/dimensions
+ int32_t DistributeDim = 0; // Dimensions lower than this one
+ // must end up in one WG
+ TgtLoopDescTy Levels[3]; // Up to 3 loops
+};
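+// For example (illustrative values), a collapsed 2-D loop nest of 1024 x 64
+// iterations with unit strides would be described as:
+//   TgtNDRangeDescTy Desc;
+//   Desc.NumLoops = 2;
+//   Desc.Levels[0] = {0, 1023, 1}; // Lb, Ub, Stride of dimension 0
+//   Desc.Levels[1] = {0, 63, 1};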
+
+/// Kernel properties.
+struct KernelPropertiesTy {
+ uint32_t Width = 0;
+ uint32_t SIMDWidth = 0;
+ uint32_t MaxThreadGroupSize = 0;
+
+ /// Cached input parameters used in the previous launch
+ TgtNDRangeDescTy LoopDesc;
+ int32_t NumTeams = -1;
+ int32_t ThreadLimit = -1;
+
+ /// Cached parameters used in the previous launch
+ ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX;
+ uint32_t GroupSizes[3] = {0, 0, 0};
+ ze_group_count_t GroupCounts{0, 0, 0};
+ bool AllowCooperative = false;
+
+ std::mutex Mtx;
+
+ static constexpr TgtNDRangeDescTy LoopDescInit = {};
+
+ /// Check if we can reuse group parameters.
+ bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+ const int32_t _NumTeams, const int32_t _ThreadLimit,
+ uint32_t *_GroupSizes, ze_group_count_t &_GroupCounts,
+ bool &_AllowCooperative) const {
+ if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
+ return false;
+ if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc)))
+ return false;
+ if (_NumTeams != NumTeams || _ThreadLimit != ThreadLimit)
+ return false;
+ // Found matching input parameters.
+ std::copy_n(GroupSizes, 3, _GroupSizes);
+ _GroupCounts = GroupCounts;
+ _AllowCooperative = AllowCooperative;
+ return true;
+ }
+
+ /// Update cached group parameters.
+ void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+ const int32_t _NumTeams, const int32_t _ThreadLimit,
+ const uint32_t *_GroupSizes,
+ const ze_group_count_t &_GroupCounts,
+ const bool &_AllowCooperative) {
+ LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
+ NumTeams = _NumTeams;
+ ThreadLimit = _ThreadLimit;
+ std::copy_n(_GroupSizes, 3, GroupSizes);
+ GroupCounts = _GroupCounts;
+ AllowCooperative = _AllowCooperative;
+ }
+};
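+// Intended launch-time pattern (sketch; 'Props' and the locals are just for
+// illustration): under the mutex, first try to reuse the cached shape, and
+// only recompute and re-cache on a miss.
+//
+//   std::lock_guard<std::mutex> Lock(Props.Mtx);
+//   if (!Props.reuseGroupParams(Loop, Teams, Limit, Sizes, Counts, Coop)) {
+//     /* ...compute Sizes/Counts/Coop for this launch... */
+//     Props.cacheGroupParams(Loop, Teams, Limit, Sizes, Counts, Coop);
+//   }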
+
+class L0KernelTy : public GenericKernelTy {
+ // L0 Kernel Handle
+ ze_kernel_handle_t zeKernel;
+ // Kernel Properties
+ KernelPropertiesTy Properties;
+ auto &getProperties() { return Properties; }
+
+ int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ __tgt_async_info *AsyncInfo) const;
+
+ void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
+ uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts,
+ bool HalfNumThreads,
+ bool IsTeamsNDRange) const;
+
+ int32_t decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const;
+
+ Error buildKernel(L0ProgramTy &Program);
+
+public:
+ /// Create a L0 kernel with a name and an execution mode.
+ L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
+ ~L0KernelTy() {
+ if (zeKernel)
+ CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel);
+ }
+ L0KernelTy(const L0KernelTy &) = delete;
+ L0KernelTy(L0KernelTy &&) = delete;
+ L0KernelTy &operator=(const L0KernelTy &) = delete;
+ L0KernelTy &operator=(const L0KernelTy &&) = delete;
+
+ const auto &getProperties() const { return Properties; }
+
+ /// Initialize the L0 kernel.
+ Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
+ /// Launch the L0 kernel function.
+ Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
+ uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+
+ Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ uint64_t DynamicMemSize) const override{
+ L0_UNIMPLEMENTED_ERR}
+
+ ze_kernel_handle_t getZeKernel() const {
+ return zeKernel;
+ }
+
+ int32_t getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, void *LoopDesc,
+ bool &AllowCooperative) const;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
new file mode 100644
index 0000000000000..50af80a19a93a
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -0,0 +1,574 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <level_zero/ze_api.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+#define ALLOC_KIND_TO_STR(Kind) \
+ (Kind == TARGET_ALLOC_HOST \
+ ? "host memory" \
+ : (Kind == TARGET_ALLOC_SHARED \
+ ? "shared memory" \
+ : (Kind == TARGET_ALLOC_DEVICE ? "device memory" \
+ : "unknown memory")))
+
+// forward declarations
+struct L0OptionsTy;
+class L0DeviceTy;
+class L0ContextTy;
+
+struct DynamicMemHeapTy {
+ /// Base address memory is allocated from
+ uintptr_t AllocBase = 0;
+ /// Minimal size served by the current heap
+ size_t BlockSize = 0;
+ /// Max size served by the current heap
+ size_t MaxSize = 0;
+ /// Available memory blocks
+ uint32_t NumBlocks = 0;
+ /// Number of block descriptors
+ uint32_t NumBlockDesc = 0;
+ /// Number of block counters
+ uint32_t NumBlockCounter = 0;
+ /// List of memory block descriptors
+ uint64_t *BlockDesc = nullptr;
+ /// List of memory block counters
+ uint32_t *BlockCounter = nullptr;
+};
+
+struct DynamicMemPoolTy {
+ /// Location of device memory blocks
+ void *PoolBase = nullptr;
+ /// Heap size common to all heaps
+ size_t HeapSize = 0;
+ /// Number of heaps available
+ uint32_t NumHeaps = 0;
+ /// Heap descriptors (using fixed-size array to simplify memory allocation)
+ DynamicMemHeapTy HeapDesc[8];
+};
+
+/// Memory allocation information used in memory allocation/deallocation.
+struct MemAllocInfoTy {
+ /// Base address allocated from compute runtime
+ void *Base = nullptr;
+ /// Allocation size known to users/libomptarget
+ size_t Size = 0;
+ /// TARGET_ALLOC kind
+ int32_t Kind = TARGET_ALLOC_DEFAULT;
+ /// Allocation from pool?
+ bool InPool = false;
+ /// Is implicit argument
+ bool ImplicitArg = false;
+
+ MemAllocInfoTy() = default;
+
+ MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
+ bool _ImplicitArg)
+ : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
+ ImplicitArg(_ImplicitArg) {}
+};
+
+/// Responsible for all activities involving memory allocation/deallocation.
+/// It contains memory pool management, memory allocation bookkeeping.
+class MemAllocatorTy {
+
+ /// Simple memory allocation statistics. Maintains numbers for pool allocation
+ /// and GPU RT allocation.
+ struct MemStatTy {
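+    // Index 0 tracks direct GPU RT allocations, index 1 tracks allocations
+    // served from the pool (log() selects the slot via I = Pool ? 1 : 0).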
+ size_t Requested[2] = {0, 0}; // Requested bytes
+ size_t Allocated[2] = {0, 0}; // Allocated bytes
+ size_t Freed[2] = {0, 0}; // Freed bytes
+ size_t InUse[2] = {0, 0}; // Current memory in use
+ size_t PeakUse[2] = {0, 0}; // Peak bytes used
+ size_t NumAllocs[2] = {0, 0}; // Number of allocations
+ MemStatTy() = default;
+ };
+
+ /// Memory pool which enables reuse of already allocated blocks
+ /// -- Pool maintains a list of buckets each of which can allocate fixed-size
+ /// memory.
+ /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
+ /// -- Each memory block can allocate multiple fixed-size memory requested by
+ /// offload RT or user.
+ /// -- Memory allocation falls back to GPU RT allocation when the pool size
+ /// (total memory used by pool) reaches a threshold.
+ class MemPoolTy {
+
+ /// Memory block maintained in each bucket
+ struct BlockTy {
+ /// Base address of this block
+ uintptr_t Base = 0;
+ /// Size of the block
+ size_t Size = 0;
+ /// Supported allocation size by this block
+ size_t ChunkSize = 0;
+ /// Total number of slots
+ uint32_t NumSlots = 0;
+ /// Number of slots in use
+ uint32_t NumUsedSlots = 0;
+ /// Cached available slot returned by the last dealloc() call
+ uint32_t FreeSlot = UINT32_MAX;
+ /// Marker for the currently used slots
+ std::vector<bool> UsedSlots;
+
+ BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
+ Base = reinterpret_cast<uintptr_t>(_Base);
+ Size = _Size;
+ ChunkSize = _ChunkSize;
+ NumSlots = Size / ChunkSize;
+ NumUsedSlots = 0;
+ UsedSlots.resize(NumSlots, false);
+ }
+
+ /// Check if the current block is fully used
+ bool isFull() const { return NumUsedSlots == NumSlots; }
+
+ /// Check if the given address belongs to the current block
+ bool contains(void *Mem) const {
+ auto M = reinterpret_cast<uintptr_t>(Mem);
+ return M >= Base && M < Base + Size;
+ }
+
+ /// Allocate a single chunk from the block
+ void *alloc();
+
+ /// Deallocate the given memory
+ void dealloc(void *Mem);
+ }; // BlockTy
+
+ /// Allocation kind for the current pool
+ int32_t AllocKind = TARGET_ALLOC_DEFAULT;
+ /// Access to the allocator
+ MemAllocatorTy *Allocator = nullptr;
+ /// Minimum supported memory allocation size from pool
+ size_t AllocMin = 1 << 6; // 64B
+ /// Maximum supported memory allocation size from pool
+ size_t AllocMax = 0;
+ /// Allocation size when the pool needs to allocate a block
+ size_t AllocUnit = 1 << 16; // 64KB
+    /// Capacity of each block in the buckets, which decides the number of
+    /// allocatable chunks from the block. Each block in the bucket can
+    /// serve at least BlockCapacity chunks.
+    /// If ChunkSize * BlockCapacity <= AllocUnit
+    ///   BlockSize = AllocUnit
+    /// Otherwise,
+    ///   BlockSize = ChunkSize * BlockCapacity
+    /// In effect, this decides how much memory is over-allocated.
+ uint32_t BlockCapacity = 0;
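+    // Example with made-up numbers: with the default AllocUnit of 64KB and
+    // BlockCapacity = 4, a bucket serving 256KB chunks allocates 1MB blocks
+    // (4 chunks each), while a bucket serving 4KB chunks allocates 64KB
+    // blocks (16 chunks each).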
+ /// Total memory allocated from GPU RT for this pool
+ size_t PoolSize = 0;
+    /// Maximum allowed pool size. Allocation falls back to GPU RT
+    /// allocation when PoolSize reaches PoolSizeMax.
+ size_t PoolSizeMax = 0;
+ /// Small allocation size allowed in the pool even if pool size is over the
+ /// pool size limit
+ size_t SmallAllocMax = 1024;
+ /// Small allocation pool size
+ size_t SmallPoolSize = 0;
+ /// Small allocation pool size max (4MB)
+ size_t SmallPoolSizeMax = (4 << 20);
+ /// List of buckets
+ std::vector<std::vector<BlockTy *>> Buckets;
+ /// List of bucket parameters
+ std::vector<std::pair<size_t, size_t>> BucketParams;
+ /// Map from allocated pointer to corresponding block.
+ std::unordered_map<void *, BlockTy *> PtrToBlock;
+ /// Simple stats counting miss/hit in each bucket.
+ std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
+ /// Need to zero-initialize after L0 allocation
+ bool ZeroInit = false;
+ /// Zero-initialized values to be copied to device
+ std::vector<char> ZeroInitValue;
+
+ /// Get bucket ID from the specified allocation size.
+ uint32_t getBucketId(size_t Size) {
+ uint32_t Count = 0;
+ for (size_t SZ = AllocMin; SZ < Size; Count++)
+ SZ <<= 1;
+ return Count;
+ }
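+    // Illustrative mapping, assuming the default AllocMin of 64 bytes:
+    // sizes up to 64 map to bucket 0, 65-128 to bucket 1, 129-256 to
+    // bucket 2, and so on (bucket N serves chunks of AllocMin << N bytes).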
+
+ public:
+ MemPoolTy() = default;
+
+ /// Construct pool with allocation kind, allocator, and user options.
+ MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+ const L0OptionsTy &Option);
+ // Used for reduction pool
+ MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+ // Used for small memory pool with fixed parameters
+ MemPoolTy(MemAllocatorTy *_Allocator);
+
+ MemPoolTy(const MemPoolTy &) = delete;
+ MemPoolTy(MemPoolTy &&) = delete;
+ MemPoolTy &operator=(const MemPoolTy &) = delete;
+ MemPoolTy &operator=(const MemPoolTy &&) = delete;
+
+ void printUsage();
+ /// Release resources used in the pool.
+ ~MemPoolTy();
+
+ /// Allocate the requested size of memory from this pool.
+ /// AllocSize is the chunk size internally used for the returned memory.
+ void *alloc(size_t Size, size_t &AllocSize);
+    /// Deallocate the specified memory and return the size that was freed.
+ size_t dealloc(void *Ptr);
+ }; // MemPoolTy
+
+ /// Allocation information maintained in the plugin
+ class MemAllocInfoMapTy {
+ /// Map from allocated pointer to allocation information
+ std::map<void *, MemAllocInfoTy> Map;
+ /// Map from target alloc kind to number of implicit arguments
+ std::map<int32_t, uint32_t> NumImplicitArgs;
+
+ public:
+ /// Add allocation information to the map
+ void add(void *Ptr, void *Base, size_t Size, int32_t Kind,
+ bool InPool = false, bool ImplicitArg = false);
+
+ /// Remove allocation information for the given memory location
+ bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
+
+ /// Finds allocation information for the given memory location
+ const MemAllocInfoTy *find(void *Ptr) const {
+ auto AllocInfo = Map.find(Ptr);
+ if (AllocInfo == Map.end())
+ return nullptr;
+ else
+ return &AllocInfo->second;
+ }
+
+ /// Check if the map contains the given pointer and offset
+ bool contains(const void *Ptr, size_t Size) const {
+ if (Map.size() == 0)
+ return false;
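+      // upper_bound returns the first entry whose key is strictly greater
+      // than Ptr, so the only candidate allocation containing Ptr is the
+      // entry immediately before it.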
+ auto I = Map.upper_bound(const_cast<void *>(Ptr));
+ if (I == Map.begin())
+ return false;
+ --I;
+ bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr &&
+ (uintptr_t)Ptr + (uintptr_t)Size <=
+ (uintptr_t)I->first + (uintptr_t)I->second.Size;
+ return Ret;
+ }
+
+ /// Returns the number of implicit arguments for the specified allocation
+ /// kind.
+ size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+ }; // MemAllocInfoMapTy
+
+ /// L0 context to use
+ const L0ContextTy *L0Context = nullptr;
+ /// L0 device to use
+ L0DeviceTy *Device = nullptr;
+ /// Whether the device supports large memory allocation
+ bool SupportsLargeMem = false;
+ /// Cached max alloc size supported by device
+ uint64_t MaxAllocSize = INT64_MAX;
+ /// Map from allocation kind to memory statistics
+ std::unordered_map<int32_t, MemStatTy> Stats;
+ /// Map from allocation kind to memory pool
+ std::unordered_map<int32_t, MemPoolTy> Pools;
+ /// Memory pool dedicated to reduction scratch space
+ std::unique_ptr<MemPoolTy> ReductionPool;
+ /// Memory pool dedicated to reduction counters
+ std::unique_ptr<MemPoolTy> CounterPool;
+ /// Allocation information map
+ MemAllocInfoMapTy AllocInfo;
+ /// RTL-owned memory that needs to be freed automatically
+ std::list<void *> MemOwned;
+ /// Lock protection
+ std::mutex Mtx;
+ /// Allocator only supports host memory
+ bool IsHostMem = false;
+  // Internal deallocation function to be called when already
+  // holding the Mtx lock.
+ int32_t dealloc_locked(void *Ptr);
+
+public:
+ MemAllocatorTy() = default;
+
+ MemAllocatorTy(const MemAllocatorTy &) = delete;
+ MemAllocatorTy(MemAllocatorTy &&) = delete;
+ MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+ MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+ /// Release resources and report statistics if requested
+ ~MemAllocatorTy() {
+ if (L0Context)
+ deinit(); // Release resources
+ }
+ void deinit();
+
+ /// Allocator only supports host memory
+ bool supportsHostMem() { return IsHostMem; }
+
+ void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+ void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+ void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+ /// Allocate memory from L0 GPU RT. We use over-allocation workaround
+ /// to support target pointer with offset, and positive "ActiveSize" is
+ /// specified in such cases for correct debug logging.
+ void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
+
+ /// Allocate memory with the specified information from a memory pool
+ void *alloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+ bool UserAlloc, bool DevMalloc, uint32_t MemAdvice,
+ AllocOptionTy AllocOpt);
+
+ /// Deallocate memory
+ int32_t dealloc(void *Ptr) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return dealloc_locked(Ptr);
+ }
+
+ /// Check if the given memory location and offset belongs to any allocated
+ /// memory
+ bool contains(const void *Ptr, size_t Size) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return AllocInfo.contains(Ptr, Size);
+ }
+
+ /// Get allocation information for the specified memory location
+ const MemAllocInfoTy *getAllocInfo(void *Ptr) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return AllocInfo.find(Ptr);
+ }
+
+ /// Get kernel indirect access flags using implicit argument info
+ ze_kernel_indirect_access_flags_t getIndirectFlags() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ze_kernel_indirect_access_flags_t Ret = 0;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+ return Ret;
+ }
+
+ /// Log memory allocation/deallocation
+ void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
+ if (Stats.count(Kind) == 0)
+ return; // Stat is disabled
+
+ auto &ST = Stats[Kind];
+ int32_t I = Pool ? 1 : 0;
+ if (ReqSize > 0) {
+ ST.Requested[I] += ReqSize;
+ ST.Allocated[I] += Size;
+ ST.InUse[I] += Size;
+ ST.NumAllocs[I]++;
+ } else {
+ ST.Freed[I] += Size;
+ ST.InUse[I] -= Size;
+ }
+ ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
+ }
+
+ /// Perform copy operation
+ int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+}; /// MemAllocatorTy
+
+// Simple generic wrapper to reuse objects.
+// Objects must have an accessible zero-argument constructor.
+template <class ObjTy> class ObjPool {
+ // Protection
+ std::unique_ptr<std::mutex> Mtx;
+ // List of Objects
+ std::list<ObjTy *> Objects;
+
+public:
+ ObjPool() { Mtx.reset(new std::mutex); }
+
+ ObjPool(const ObjPool &) = delete;
+  ObjPool(ObjPool &&) = delete;
+ ObjPool &operator=(const ObjPool &) = delete;
+ ObjPool &operator=(const ObjPool &&) = delete;
+
+ ObjTy *get() {
+ if (!Objects.empty()) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ if (!Objects.empty()) {
+ const auto Ret = Objects.back();
+ Objects.pop_back();
+ return Ret;
+ }
+ }
+ return new ObjTy();
+ }
+
+ void release(ObjTy *obj) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ Objects.push_back(obj);
+ }
+
+ ~ObjPool() {
+ for (auto object : Objects)
+ delete object;
+ }
+};
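+
+// Usage sketch: objects are recycled rather than freed. A plausible use
+// (names here are illustrative) is recycling the plugin's async queues:
+//   ObjPool<AsyncQueueTy> QueuePool;
+//   AsyncQueueTy *Q = QueuePool.get(); // reuse a pooled object or new one
+//   ...
+//   QueuePool.release(Q);              // return the object for later reuse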
+
+/// Common event pool used in the plugin. This event pool assumes all events
+/// from the pool are host-visible and use the same event pool flag.
+class EventPoolTy {
+ /// Size of L0 event pool created on demand
+ size_t PoolSize = 64;
+
+ /// Context of the events
+ ze_context_handle_t Context = nullptr;
+
+  /// Additional event pool flags common to this pool
+ uint32_t Flags = 0;
+
+ /// Protection
+ std::unique_ptr<std::mutex> Mtx;
+
+ /// List of created L0 event pools
+ std::list<ze_event_pool_handle_t> Pools;
+
+ /// List of free L0 events
+ std::list<ze_event_handle_t> Events;
+
+#ifdef OMPT_SUPPORT
+ /// Event to OMPT record map. The timestamp information is recorded to the
+ /// OMPT record before the event is recycled.
+ std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
+#endif // OMPT_SUPPORT
+
+public:
+ /// Initialize context, flags, and mutex
+ void init(ze_context_handle_t _Context, uint32_t _Flags) {
+ Context = _Context;
+ Flags = _Flags;
+ Mtx.reset(new std::mutex);
+ }
+
+ /// Destroys L0 resources
+ void deinit() {
+ for (auto E : Events)
+ CALL_ZE_RET_VOID(zeEventDestroy, E);
+ for (auto P : Pools)
+ CALL_ZE_RET_VOID(zeEventPoolDestroy, P);
+ }
+
+ /// Get a free event from the pool
+ ze_event_handle_t getEvent();
+
+ /// Return an event to the pool
+ void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
+};
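+
+// Design note: events obtained from getEvent() are recycled through
+// releaseEvent() via the free list above rather than destroyed, so new L0
+// event pools (PoolSize events each) are only created when the free list
+// is exhausted.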
+
+/// Staging buffer
+/// A single staging buffer is not enough when batching is enabled since there
+/// can be multiple pending copy operations.
+class StagingBufferTy {
+ /// Context for L0 calls
+ ze_context_handle_t Context = nullptr;
+ /// Max allowed size for staging buffer
+ size_t Size = L0StagingBufferSize;
+ /// Number of buffers allocated together
+ size_t Count = L0StagingBufferCount;
+  /// List of buffers; grows by Count buffers whenever more space is required
+ std::list<void *> Buffers;
+ /// Next buffer location in the buffers
+ size_t Offset = 0;
+
+ void *addBuffers() {
+ ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+ nullptr, 0};
+ void *Ret = nullptr;
+ size_t AllocSize = Size * Count;
+ CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
+ L0Alignment, &Ret);
+ Buffers.push_back(Ret);
+ return Ret;
+ }
+
+public:
+ StagingBufferTy() = default;
+ StagingBufferTy(const StagingBufferTy &) = delete;
+ StagingBufferTy(StagingBufferTy &&) = delete;
+ StagingBufferTy &operator=(const StagingBufferTy &) = delete;
+ StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
+
+ ~StagingBufferTy() {
+ if (initialized())
+ clear();
+ }
+
+ void clear() {
+ ze_result_t Rc;
+    (void)Rc; // Suppress spurious unused-variable warnings from GCC builds.
+ for (auto Ptr : Buffers)
+ CALL_ZE(Rc, zeMemFree, Context, Ptr);
+ Context = nullptr;
+ }
+
+ bool initialized() const { return Context != nullptr; }
+
+ void init(ze_context_handle_t _Context, size_t _Size, size_t _Count) {
+ Context = _Context;
+ Size = _Size;
+ Count = _Count;
+ }
+
+ void reset() { Offset = 0; }
+
+ /// Always return the first buffer
+ void *get() {
+ if (Size == 0 || Count == 0)
+ return nullptr;
+ return Buffers.empty() ? addBuffers() : Buffers.front();
+ }
+
+ /// Return the next available buffer
+ void *getNext() {
+ void *Ret = nullptr;
+ if (Size == 0 || Count == 0)
+ return Ret;
+
+ size_t AllocSize = Size * Count;
+ bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
+ if (NeedToGrow)
+ Ret = addBuffers();
+ else
+ Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize));
+
+ if (!Ret)
+ return nullptr;
+
+ Offset += Size;
+ return Ret;
+ }
+
+ /// Return either a fixed buffer or next buffer
+ void *get(bool Next) { return Next ? getNext() : get(); }
+};
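+
+// Usage note: get(false) reuses the fixed front buffer, which suffices for
+// synchronous copies, while get(true) hands out a distinct Size-byte slot
+// per pending asynchronous copy so batched transfers do not overwrite each
+// other.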
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
new file mode 100644
index 0000000000000..b3ecd25f56ddd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -0,0 +1,189 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <level_zero/ze_api.h>
+
+#include "Shared/EnvironmentVar.h"
+
+#include "L0Defs.h"
+
+namespace llvm::omp::target::plugin {
+/// Command submission mode
+enum class CommandModeTy { Sync = 0, Async, AsyncOrdered };
+
+/// Specialization constants used for a module compilation.
+class SpecConstantsTy {
+ std::vector<uint32_t> ConstantIds;
+ std::vector<const void *> ConstantValues;
+
+public:
+ SpecConstantsTy() = default;
+  SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete;
+  // The move constructor must take a non-const rvalue so the vectors are
+  // actually moved; moving from a const rvalue would silently copy the
+  // pointers and cause a double delete in the destructor.
+  SpecConstantsTy(SpecConstantsTy &&Other)
+      : ConstantIds(std::move(Other.ConstantIds)),
+        ConstantValues(std::move(Other.ConstantValues)) {}
+
+ ~SpecConstantsTy() {
+ for (auto I : ConstantValues) {
+ const char *ValuePtr = reinterpret_cast<const char *>(I);
+ delete[] ValuePtr;
+ }
+ }
+
+ template <typename T> void addConstant(uint32_t Id, T Val) {
+ const size_t ValSize = sizeof(Val);
+ char *ValuePtr = new char[ValSize];
+ *reinterpret_cast<T *>(ValuePtr) = Val;
+
+ ConstantIds.push_back(Id);
+ ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
+ }
+
+ ze_module_constants_t getModuleConstants() const {
+ ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
+ ConstantIds.data(),
+ // Unfortunately we have to const_cast it.
+ // L0 data type should probably be fixed.
+ const_cast<const void **>(ConstantValues.data())};
+ return Tmp;
+ }
+};
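+
+// Usage sketch (hypothetical ID and value): constants are registered once
+// and then handed to module creation:
+//   SpecConstantsTy SC;
+//   SC.addConstant<uint32_t>(/*Id=*/1, /*Val=*/16u);
+//   ze_module_constants_t MC = SC.getModuleConstants();
+//   // MC can then be referenced from ze_module_desc_t::pConstants.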
+#define FIXED static constexpr
+
+/// L0 Plugin flags
+struct L0OptionFlagsTy {
+ uint64_t UseMemoryPool : 1;
+ uint64_t Reserved : 63;
+ L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {}
+};
+
+struct L0OptionsTy {
+ /// Binary flags
+ L0OptionFlagsTy Flags;
+
+ /// Staging buffer size
+ size_t StagingBufferSize = L0StagingBufferSize;
+
+ /// Staging buffer count
+ size_t StagingBufferCount = L0StagingBufferCount;
+
+ // TODO: This should probably be an array indexed by AllocKind
+ /// Memory pool parameters
+ /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
+ std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
+ {TARGET_ALLOC_DEVICE, {1, 4, 256}},
+ {TARGET_ALLOC_HOST, {1, 4, 256}},
+ {TARGET_ALLOC_SHARED, {8, 4, 256}}};
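+  // For example, the first entry reads: the device-memory pool serves
+  // allocations up to 1MB, over-allocates blocks with capacity for 4
+  // chunks, and is capped at a total pool size of 256MB.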
+
+ /// Parameters for memory pools dedicated to reduction scratch space
+ std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
+
+ /// Oversubscription rate for normal kernels
+ FIXED uint32_t SubscriptionRate = 4;
+
+ /// Loop kernels with known ND-range may be known to have
+ /// few iterations and they may not exploit the offload device
+ /// to the fullest extent.
+ /// Let's assume a device has N total HW threads available,
+ /// and the kernel requires M hardware threads with LWS set to L.
+ /// If (M < N * ThinThreadsThreshold), then we will try
+ /// to iteratively divide L by 2 to increase the number of HW
+ /// threads used for executing the kernel. Effectively, we will
+ /// end up with L less than the kernel's SIMD width, so the HW
+ /// threads will not use all their SIMD lanes. This (presumably) should
+ /// allow more parallelism, because the stalls in the SIMD lanes
+ /// will be distributed across more HW threads, and the probability
+ /// of having a stall (or a sequence of stalls) on a critical path
+ /// in the kernel should decrease.
+  /// Anyway, this is just a heuristic that seems to work well for some
+ /// kernels (which poorly expose parallelism in the first place).
+ FIXED double ThinThreadsThreshold = 0.1;
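+  /// Worked example with made-up numbers: with N = 8192 available HW
+  /// threads and a kernel that needs M = 512 threads, M < N * 0.1 = 819,
+  /// so L is halved repeatedly until the kernel occupies enough HW threads
+  /// to clear the threshold.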
+
+  /// List of root devices provided via the ONEAPI_DEVICE_SELECTOR option.
+  /// All discard filters must appear before the accept filters.
+ std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
+
+  /// Whether the given RootID, SubID, CCSID is specified in
+  /// ONEAPI_DEVICE_SELECTOR
+ bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
+
+ // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit) are used by the
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+ // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+ // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+ // builtins.
+ std::string CompilationOptions = "-cl-std=CL2.0 ";
+ std::string InternalCompilationOptions = "-cl-take-global-address";
+ std::string UserCompilationOptions = "";
+
+ // Spec constants used for all modules.
+ SpecConstantsTy CommonSpecConstants;
+
+ /// Command execution mode.
+ /// Whether the runtime uses asynchronous mode or not depends on the type of
+ /// devices and whether immediate command list is fully enabled.
+ CommandModeTy CommandMode = CommandModeTy::Async;
+
+ bool Init = false; // have the options already been processed
+
+  /// Options are read from environment variables lazily via init().
+  L0OptionsTy() {}
+
+ void processEnvironmentVars();
+
+ void init() {
+ if (!Init) {
+ processEnvironmentVars();
+ Init = true;
+ }
+ }
+
+  /// Parse the string and split it into tokens of string_views based on the
+  /// Delim separator.
+ std::vector<std::string_view> tokenize(const std::string_view &Filter,
+ const std::string &Delim,
+ bool ProhibitEmptyTokens = false);
+
+  bool isDigits(const std::string_view &str) {
+    if (str.empty())
+      return false;
+    return std::all_of(str.begin(), str.end(), ::isdigit);
+  }
+
+ bool match(const std::string &Var, const std::string &Matched) {
+ if (Var.size() != Matched.size())
+ return false;
+
+ auto equals = [](char a, char b) {
+ return std::tolower(a) == std::tolower(b);
+ };
+ return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
+ equals);
+ }
+
+ bool match(const std::string &Var, const char *Matched) {
+ std::string Str(Matched);
+ return match(Var, Str);
+ }
+
+ bool match(const StringEnvar &Var, const char *Matched) {
+ return match(Var.get(), Matched);
+ }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..4658c1cdab1df
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,136 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Plugin interface for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Defs.h"
+#include "L0Device.h"
+#include "L0Memory.h"
+#include "L0Options.h"
+#include "L0Program.h"
+#include "TLS.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Class implementing the LevelZero specific functionalities of the plugin.
+class LevelZeroPluginTy final : public GenericPluginTy {
+private:
+ /// Number of devices available including subdevices
+ uint32_t NumDevices = 0;
+
+ /// Context (and Driver) specific data
+ std::list<L0ContextTy> ContextList;
+
+ /// L0 device used by each OpenMP device
+ using DeviceContainerTy = llvm::SmallVector<L0DeviceTy *>;
+ DeviceContainerTy L0Devices;
+
+ // Table containing per-thread information using TLS
+ L0ThreadTblTy ThreadTLSTable;
+ // Table containing per-thread information for each device using TLS
+ L0DeviceTLSTableTy DeviceTLSTable;
+ // Table containing per-thread information for each Context using TLS
+ L0ContextTLSTableTy ContextTLSTable;
+
+ /// L0 plugin global options
+ static L0OptionsTy Options;
+
+ /// Global mutex
+ std::mutex GlobalMutex;
+
+ /// Common pool of AsyncQueue
+ AsyncQueuePoolTy AsyncQueuePool;
+
+ auto &getTLS() { return ThreadTLSTable.get(); }
+
+public:
+ LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
+ virtual ~LevelZeroPluginTy() {}
+
+ auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); }
+ auto &getContextTLS(ze_context_handle_t Context) {
+ return ContextTLSTable.get(Context);
+ }
+
+ static const auto &getOptions() { return Options; }
+
+ auto &getGlobalMutex() { return GlobalMutex; }
+
+ struct DevicesRangeTy {
+ using iterator = DeviceContainerTy::iterator;
+
+ iterator BeginIt;
+ iterator EndIt;
+
+ DevicesRangeTy(iterator BeginIt, iterator EndIt)
+ : BeginIt(BeginIt), EndIt(EndIt) {}
+
+ auto &begin() { return BeginIt; }
+ auto &end() { return EndIt; }
+ };
+
+ auto getDevicesRange() {
+ return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
+ }
+
+ /// Clean-up routine to be invoked by the destructor or
+ /// LevelZeroPluginTy::deinit.
+ void closeRTL();
+
+ /// Find L0 devices and initialize device properties.
+ /// Returns number of devices reported to omptarget.
+ int32_t findDevices();
+
+ L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
+ assert("Invalid device ID" && DeviceId >= 0 &&
+ DeviceId < static_cast<int32_t>(L0Devices.size()));
+ return *L0Devices[DeviceId];
+ }
+
+ uint32_t getNumRootDevices() const { return NumDevices; }
+
+ AsyncQueueTy *getAsyncQueue() {
+ auto *Queue = getTLS().getAsyncQueue();
+ if (!Queue)
+ Queue = AsyncQueuePool.get();
+ return Queue;
+ }
+
+ void releaseAsyncQueue(AsyncQueueTy *Queue) {
+ if (!Queue)
+ return;
+ Queue->reset();
+ Queue->InUse = false;
+ if (!getTLS().releaseAsyncQueue(Queue))
+ AsyncQueuePool.release(Queue);
+ }
+
+ // Plugin interface
+
+ Expected<int32_t> initImpl() override;
+ Error deinitImpl() override;
+ GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+ int32_t NumDevices) override;
+ GenericGlobalHandlerTy *createGlobalHandler() override;
+ uint16_t getMagicElfBits() const override;
+ Triple::ArchType getTripleArch() const override;
+ const char *getName() const override;
+ Expected<bool> isELFCompatible(uint32_t DeviceId,
+ StringRef Image) const override;
+
+ Error flushQueueImpl(omp_interop_val_t *Interop) override;
+ Error syncBarrierImpl(omp_interop_val_t *Interop) override;
+ Error asyncBarrierImpl(omp_interop_val_t *Interop) override;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
new file mode 100644
index 0000000000000..a548b486f4642
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -0,0 +1,135 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Kernel.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Program data to be initialized by plugin
+struct ProgramDataTy {
+ int Initialized = 0;
+ int NumDevices = 0;
+ int DeviceNum = -1;
+ uint32_t TotalEUs = 0;
+ uint32_t HWThreadsPerEU = 0;
+ uintptr_t DynamicMemoryLB = 0;
+ uintptr_t DynamicMemoryUB = 0;
+ int DeviceType = 0;
+ void *DynamicMemPool = nullptr;
+ int TeamsThreadLimit = 0;
+};
+
+/// Level Zero program that can contain multiple modules.
+class L0ProgramTy : public DeviceImageTy {
+ /// Handle multiple modules within a single target image
+ llvm::SmallVector<ze_module_handle_t> Modules;
+
+ /// Map of kernel names to Modules
+ std::unordered_map<std::string, ze_module_handle_t> KernelsToModuleMap;
+
+ /// List of kernels built for this image
+ /// We need to delete them ourselves as the main library is not doing
+ /// that right now
+ std::list<L0KernelTy *> Kernels;
+
+ /// Module that contains global data including device RTL
+ ze_module_handle_t GlobalModule = nullptr;
+
+ /// Requires module link
+ bool RequiresModuleLink = false;
+
+  /// Is this a library module
+ bool IsLibModule = false;
+
+ /// Build a single module with the given image, build option, and format.
+ int32_t addModule(const size_t Size, const uint8_t *Image,
+ const std::string &BuildOption, ze_module_format_t Format);
+ /// Read file and return the size of the binary if successful.
+ size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
+ int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
+ void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+ std::string &Options) const;
+
+ /// Check if the image should be handled as a library module
+ void setLibModule();
+
+ L0DeviceTy &getL0Device() const;
+
+public:
+ L0ProgramTy() = delete;
+
+ L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
+ const __tgt_device_image *Image)
+ : DeviceImageTy(ImageId, Device, Image) {}
+
+ ~L0ProgramTy();
+
+ L0ProgramTy(const L0ProgramTy &other) = delete;
+ L0ProgramTy(L0ProgramTy &&) = delete;
+ L0ProgramTy &operator=(const L0ProgramTy &) = delete;
+ L0ProgramTy &operator=(const L0ProgramTy &&) = delete;
+
+  static L0ProgramTy &makeL0Program(DeviceImageTy &Image) {
+    return static_cast<L0ProgramTy &>(Image);
+  }
+
+ /// Build modules from the target image description
+ int32_t buildModules(std::string &BuildOptions);
+
+ /// Link modules stored in \p Modules.
+ int32_t linkModules();
+
+ /// Loads the kernels names from all modules
+ int32_t loadModuleKernels();
+
+ /// Read data from the location in the device image which corresponds to the
+ /// specified global variable name.
+ int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
+
+ /// Write data to the location in the device image which corresponds to the
+ /// specified global variable name.
+ int32_t writeGlobalVariable(const char *Name, size_t Size,
+ const void *HostPtr);
+
+  /// Looks up an OpenMP declare target global variable with the given
+  /// \p Name in the device environment for the current device.
+  /// The lookup is first done via the device offload table. If it fails,
+  /// then the lookup falls back to a non-OpenMP specific lookup on the
+  /// device.
+ void *getOffloadVarDeviceAddr(const char *Name) const;
+
+ /// Returns the handle of a module that contains a given Kernel name
+ ze_module_handle_t findModuleFromKernelName(const char *KernelName) const {
+ auto K = KernelsToModuleMap.find(std::string(KernelName));
+ if (K == KernelsToModuleMap.end())
+ return nullptr;
+
+ return K->second;
+ }
+
+ void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); }
+};
+
+struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy {
+ Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ GlobalTy &DeviceGlobal) override;
+};
+
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+ uint64_t &MinorVer);
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+ uint64_t &MinorVer);
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
new file mode 100644
index 0000000000000..2eeae81016dee
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -0,0 +1,193 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing L0
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+#pragma once
+
+#include "Shared/Debug.h"
+#include "omptarget.h"
+#include <string>
+#include <level_zero/ze_api.h>
+
+#define STR(x) #x
+#define TO_STRING(x) STR(x)
+
+#define DPCALL(...) \
+ do { \
+ if (getDebugLevel() > 1) \
+ DP(__VA_ARGS__); \
+ } while (0)
+
+#define FATAL_ERROR(Msg) \
+ do { \
+ fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
+ fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+#define WARNING(...) \
+ do { \
+ fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
+ fprintf(stderr, "Warning: " __VA_ARGS__); \
+ } while (0)
+
+#define INVALID_OPTION(Name, Value) \
+ WARNING("Ignoring invalid option " #Name "=%s\n", Value)
+
+#define CALL_ZE(Rc, Fn, ...) \
+ do { \
+ Rc = Fn(__VA_ARGS__); \
+ } while (0)
+
+#define CALL_ZE_RC(Rc, Fn, ...) \
+ do { \
+ CALL_ZE(Rc, Fn, __VA_ARGS__); \
+ if (Rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \
+ getZeErrorName(Rc)); \
+ } \
+  } while (0)
+
+/// For non-thread-safe functions
+#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...) \
+ do { \
+ Mtx.lock(); \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ Mtx.unlock(); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ return Ret; \
+ } \
+ } while (0)
+
+#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
+
+/// For thread-safe functions
+#define CALL_ZE_RET(Ret, Fn, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ return Ret; \
+ } \
+ } while (0)
+
+#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ERROR(Fn, ...) \
+ CALL_ZE_RET( \
+ Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \
+ STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
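+
+// Usage sketch: each CALL_ZE_RET_* variant wraps an L0 call and
+// early-returns its designated value on failure, e.g. (illustrative
+// arguments):
+//   CALL_ZE_RET_FAIL(zeMemAllocDevice, Context, &Desc, Size, Align,
+//                    Device, &Ptr);
+// logs the failing ze_result_t name and returns OFFLOAD_FAIL unless the
+// call succeeds.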
+
+#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ const char *err_str = nullptr; \
+ rc = zeDriverGetLastErrorDescription( \
+ Dev.getDriverHandle(), &err_str); \
+ fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn, \
+ err_str); \
+ } \
+ } while (0)
+
+#define CALL_ZE_EXIT_FAIL(Fn, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ std::exit(EXIT_FAILURE); \
+ } \
+ } while (0)
+
+#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) \
+ return Ret; \
+ } while (0)
+
+#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...) \
+ CALL_ZE_EXT_SILENT_RET(Device, \
+ Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s", \
+ STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__)
+
+#define FOREACH_ZE_ERROR_CODE(Fn) \
+ Fn(ZE_RESULT_SUCCESS) \
+ Fn(ZE_RESULT_NOT_READY) \
+ Fn(ZE_RESULT_ERROR_DEVICE_LOST) \
+ Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) \
+ Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) \
+ Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) \
+ Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE) \
+ Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET) \
+ Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE) \
+ Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) \
+ Fn(ZE_RESULT_ERROR_NOT_AVAILABLE) \
+ Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE) \
+ Fn(ZE_RESULT_WARNING_DROPPED_DATA) \
+ Fn(ZE_RESULT_ERROR_UNINITIALIZED) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) \
+ Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT) \
+ Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) \
+ Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) \
+ Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER) \
+ Fn(ZE_RESULT_ERROR_INVALID_SIZE) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) \
+ Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) \
+ Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) \
+ Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) \
+ Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) \
+ Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) \
+ Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) \
+ Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) \
+ Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) \
+ Fn(ZE_RESULT_WARNING_ACTION_REQUIRED) \
+ Fn(ZE_RESULT_ERROR_UNKNOWN)
+
+#define CASE_TO_STRING(Num) case Num: return #Num;
+inline const char *getZeErrorName(int32_t Error) {
+ switch (Error) {
+ FOREACH_ZE_ERROR_CODE(CASE_TO_STRING)
+ default:
+ return "ZE_RESULT_ERROR_UNKNOWN";
+ }
+}
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
new file mode 100644
index 0000000000000..8a5f41312e129
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -0,0 +1,86 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Thread Level Storage abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Memory.h"
+#include "L0Trace.h"
+#include "PerThreadTable.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// All thread-local data used by the Plugin
+class L0ThreadTLSTy {
+ /// Subdevice encoding
+ int64_t SubDeviceCode = 0;
+
+ /// Async info tracking
+ static constexpr int32_t PerThreadQueues = 10;
+ AsyncQueueTy AsyncQueues[PerThreadQueues];
+ int32_t UsedQueues = 0;
+
+public:
+ L0ThreadTLSTy() = default;
+ L0ThreadTLSTy(const L0ThreadTLSTy &) = delete;
+ L0ThreadTLSTy(L0ThreadTLSTy &&) = delete;
+ L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete;
+ L0ThreadTLSTy &operator=(const L0ThreadTLSTy &&) = delete;
+ ~L0ThreadTLSTy() {}
+
+ void clear() {}
+
+ int64_t getSubDeviceCode() { return SubDeviceCode; }
+
+ void setSubDeviceCode(int64_t Code) { SubDeviceCode = Code; }
+
+ AsyncQueueTy *getAsyncQueue() {
+ AsyncQueueTy *ret = nullptr;
+ if (UsedQueues < PerThreadQueues) {
+ // there's a free queue in this thread, find it
+ for (int32_t q = 0; q < PerThreadQueues; q++) {
+ if (!AsyncQueues[q].InUse) {
+ UsedQueues++;
+ ret = &AsyncQueues[q];
+ break;
+ }
+ }
+ assert(ret && "A queue should have been found!");
+ ret->InUse = true;
+ }
+ return ret;
+ }
+
+ bool releaseAsyncQueue(AsyncQueueTy *queue) {
+ if (queue >= &AsyncQueues[0] && queue < &AsyncQueues[PerThreadQueues]) {
+ // it's a local queue
+ queue->InUse = false;
+ UsedQueues--;
+ return true;
+ }
+ return false;
+ }
+};
+
+struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
+ void clear() {
+ PerThread::clear([](auto &Entry) { Entry.clear(); });
+ }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
new file mode 100644
index 0000000000000..3f50ffd2a7260
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -0,0 +1,41 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+ int32_t /*DriverId*/)
+ : Plugin(Plugin), zeDriver(zeDriver) {
+ CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+ DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+ ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+ CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+ EventPool.init(zeContext, 0);
+ HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+ auto &TLS = Plugin.getContextTLS(getZeContext());
+ auto &Buffer = TLS.getStagingBuffer();
+ const auto &Options = Plugin.getOptions();
+ if (!Buffer.initialized())
+ Buffer.init(getZeContext(), Options.StagingBufferSize,
+ Options.StagingBufferCount);
+ return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..0029d00a07685
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1065 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+ return getPlugin().getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+static struct {
+ DeviceArchTy arch;
+ PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+ {PCIIdTy::SKL,
+ PCIIdTy::KBL,
+ PCIIdTy::CFL, PCIIdTy::CFL_2,
+ PCIIdTy::ICX,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Gen,
+ {PCIIdTy::TGL, PCIIdTy::TGL_2,
+ PCIIdTy::DG1,
+ PCIIdTy::RKL,
+ PCIIdTy::ADLS,
+ PCIIdTy::RTL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeLPG,
+ {PCIIdTy::MTL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeHPC,
+ {PCIIdTy::PVC,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeHPG,
+ {PCIIdTy::DG2_ATS_M,
+ PCIIdTy::DG2_ATS_M_2,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Xe2LP,
+ {PCIIdTy::LNL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Xe2HP,
+ {PCIIdTy::BMG,
+ PCIIdTy::None}},
+};
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+DeviceArchTy L0DeviceTy::computeArch() const {
+ const auto PCIDeviceId = getPCIId();
+ if (PCIDeviceId != 0) {
+ for (int arch = 0; arch < DeviceArchMapSize; arch++) {
+ for (int i = 0;; i++) {
+ const auto Id = DeviceArchMap[arch].ids[i];
+ if (Id == PCIIdTy::None)
+ break;
+
+ auto maskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
+ if (maskedId == Id)
+          return DeviceArchMap[arch].arch; // Masked PCI ID (family) match
+ }
+ }
+ }
+
+ DP("Warning: Cannot decide device arch for %s.\n", getNameCStr());
+ return DeviceArchTy::DeviceArch_None;
+}
+
+bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const {
+ ze_device_ip_version_ext_t IPVersion{};
+ IPVersion.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
+ IPVersion.pNext = nullptr;
+ ze_device_properties_t DevicePR{};
+ DevicePR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ DevicePR.pNext = &IPVersion;
+ CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &DevicePR);
+ return IPVersion.ipVersion >= Version;
+}
+
+/// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+std::pair<uint32_t, uint32_t> L0DeviceTy::findComputeOrdinal() {
+ std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+ ze_command_queue_group_properties_t Init{
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+ std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+ for (uint32_t I = 0; I < Count; I++) {
+ // TODO: add a separate set of ordinals for compute queue groups which
+ // support cooperative kernels
+ if (Properties[I].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+ Ordinal.first = I;
+ Ordinal.second = Properties[I].numQueues;
+ break;
+ }
+ }
+ if (Ordinal.first == UINT32_MAX)
+ DP("Error: no command queues are found\n");
+
+ return Ordinal;
+}
+
+/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
+ std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+ ze_command_queue_group_properties_t Init{
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+ std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+
+ for (uint32_t I = 0; I < Count; I++) {
+ const auto &Flags = Properties[I].flags;
+ if ((Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+ (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) {
+ auto NumQueues = Properties[I].numQueues;
+ if (LinkCopy && NumQueues > 1) {
+ Ordinal = {I, NumQueues};
+ DP("Found link copy command queue for device " DPxMOD
+ ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n",
+ DPxPTR(zeDevice), Ordinal.first, Ordinal.second);
+ break;
+ } else if (!LinkCopy && NumQueues == 1) {
+ Ordinal = {I, NumQueues};
+ DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32
+ "\n",
+ DPxPTR(zeDevice), Ordinal.first);
+ break;
+ }
+ }
+ }
+ return Ordinal;
+}
+
+void L0DeviceTy::reportDeviceInfo() const {
+ DP("Device %" PRIu32 "\n", DeviceId);
+ DP("-- Name : %s\n", getNameCStr());
+ DP("-- PCI ID : 0x%" PRIx32 "\n", getPCIId());
+ DP("-- UUID : %s\n", getUuid().c_str());
+ DP("-- Number of total EUs : %" PRIu32 "\n", getNumEUs());
+ DP("-- Number of threads per EU : %" PRIu32 "\n", getNumThreadsPerEU());
+ DP("-- EU SIMD width : %" PRIu32 "\n", getSIMDWidth());
+ DP("-- Number of EUs per subslice : %" PRIu32 "\n", getNumEUsPerSubslice());
+ DP("-- Number of subslices per slice: %" PRIu32 "\n",
+ getNumSubslicesPerSlice());
+ DP("-- Number of slices : %" PRIu32 "\n", getNumSlices());
+ DP("-- Local memory size (bytes) : %" PRIu32 "\n",
+ getMaxSharedLocalMemory());
+ DP("-- Global memory size (bytes) : %" PRIu64 "\n", getGlobalMemorySize());
+ DP("-- Cache size (bytes) : %" PRIu64 "\n", getCacheSize());
+ DP("-- Max clock frequency (MHz) : %" PRIu32 "\n", getClockRate());
+}
+
+Error L0DeviceTy::internalInit() {
+ const auto &Options = getPlugin().getOptions();
+
+ uint32_t Count = 1;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count,
+ &MemoryProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count,
+ &CacheProperties);
+
+ DeviceName =
+ std::string(DeviceProperties.name, sizeof(DeviceProperties.name));
+
+ DP("Found a GPU device, Name = %s\n", DeviceProperties.name);
+
+ DeviceArch = computeArch();
+ // Default allocation kind for this device
+ AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED;
+
+ ze_kernel_indirect_access_flags_t Flags =
+ (AllocKind == TARGET_ALLOC_DEVICE)
+ ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE
+ : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+ IndirectAccessFlags = Flags;
+
+ // Get the UUID
+ std::string uid = "";
+ for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++)
+ uid += std::to_string(DeviceProperties.uuid.id[n]);
+ DeviceUuid = std::move(uid);
+
+ ComputeOrdinal = findComputeOrdinal();
+
+ CopyOrdinal = findCopyOrdinal();
+
+ LinkCopyOrdinal = findCopyOrdinal(true);
+ IsAsyncEnabled =
+ isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
+ MemAllocator.initDevicePools(*this, getPlugin().getOptions());
+ l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
+ return Plugin::success();
+}
+
+int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+ bool ReleaseQueue) {
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (!IsAsync)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+
+ AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue;
+
+ if (!AsyncQueue->WaitEvents.empty()) {
+ const auto &WaitEvents = AsyncQueue->WaitEvents;
+ if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
+ // Only need to wait for the last event
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
+ // Synchronize on kernel event to support printf()
+ auto KE = AsyncQueue->KernelEvent;
+ if (KE && KE != WaitEvents.back()) {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX);
+ }
+ for (auto &Event : WaitEvents) {
+ releaseEvent(Event);
+ }
+ } else { // Async
+ // Wait for all events. We should wait and reset events in reverse order
+ // to avoid premature event reset. If we have a kernel event in the
+ // queue, it is the last event to wait for since all wait events of the
+ // kernel are signaled before the kernel is invoked. We always invoke
+ // synchronization on kernel event to support printf().
+ bool WaitDone = false;
+ for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
+ if (!WaitDone) {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX);
+ if (*Itr == AsyncQueue->KernelEvent)
+ WaitDone = true;
+ }
+ releaseEvent(*Itr);
+ }
+ }
+ }
+
+ // Commit delayed USM2M copies
+ for (auto &USM2M : AsyncQueue->USM2MList) {
+ std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+ std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+ }
+ // Commit delayed H2M copies
+ for (auto &H2M : AsyncQueue->H2MList) {
+ std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+ static_cast<char *>(std::get<1>(H2M)));
+ }
+ if (ReleaseQueue) {
+ Plugin.releaseAsyncQueue(AsyncQueue);
+ getStagingBuffer().reset();
+ AsyncInfo->Queue = nullptr;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ if (Size == 0)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+
+ const auto DeviceId = getDeviceId();
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = Plugin.getAsyncQueue();
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ const auto TgtPtrType = getMemAllocType(TgtPtr);
+ if (TgtPtrType == ZE_MEMORY_TYPE_SHARED ||
+ TgtPtrType == ZE_MEMORY_TYPE_HOST) {
+ std::copy_n(static_cast<const char *>(HstPtr), Size,
+ static_cast<char *>(TgtPtr));
+ } else {
+ const void *SrcPtr = HstPtr;
+ if (isDiscreteDevice() &&
+ static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
+ getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+ SrcPtr = getStagingBuffer().get(IsAsync);
+ std::copy_n(static_cast<const char *>(HstPtr), Size,
+ static_cast<char *>(const_cast<void *>(SrcPtr)));
+ }
+ int32_t RC;
+ if (IsAsync)
+ RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo);
+ else
+ RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo);
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr),
+ DPxPTR(TgtPtr));
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ if (Size == 0)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+ const auto DeviceId = getDeviceId();
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+ AsyncInfo->Queue = Plugin.getAsyncQueue();
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ auto AsyncQueue =
+ IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+ auto TgtPtrType = getMemAllocType(TgtPtr);
+ if (TgtPtrType == ZE_MEMORY_TYPE_HOST ||
+ TgtPtrType == ZE_MEMORY_TYPE_SHARED) {
+ bool CopyNow = true;
+ if (IsAsync) {
+ if (AsyncQueue->KernelEvent) {
+ // Delay Host/Shared USM to host memory copy since it must wait for
+ // kernel completion.
+ AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size);
+ CopyNow = false;
+ }
+ }
+ if (CopyNow) {
+ std::copy_n(static_cast<const char *>(TgtPtr), Size,
+ static_cast<char *>(HstPtr));
+ }
+ } else {
+ void *DstPtr = HstPtr;
+ if (isDiscreteDevice() &&
+ static_cast<size_t>(Size) <=
+ getPlugin().getOptions().StagingBufferSize &&
+ getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+ DstPtr = getStagingBuffer().get(IsAsync);
+ }
+ int32_t RC;
+ if (IsAsync)
+ RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
+ /* CopyTo */ false);
+ else
+ RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo);
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ if (DstPtr != HstPtr) {
+ if (IsAsync) {
+ // Store delayed H2M data copies
+ auto &H2MList = AsyncQueue->H2MList;
+ H2MList.emplace_back(DstPtr, HstPtr, static_cast<size_t>(Size));
+ } else {
+ std::copy_n(static_cast<char *>(DstPtr), Size,
+ static_cast<char *>(HstPtr));
+ }
+ }
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr),
+ DPxPTR(HstPtr));
+
+ return OFFLOAD_SUCCESS;
+}
+
+Expected<DeviceImageTy *>
+L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
+ int32_t ImageId) {
+ auto *PGM = getProgramFromImage(TgtImage);
+ if (PGM) {
+ // Program already exists
+ return PGM;
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
+ DPxPTR(TgtImage->ImageStart));
+
+ const size_t NumEntries =
+ (size_t)(TgtImage->EntriesEnd - TgtImage->EntriesBegin);
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Expecting to have %zu entries defined\n", NumEntries);
+ (void)NumEntries; // silence warning
+
+ const auto &Options = getPlugin().getOptions();
+ std::string CompilationOptions(Options.CompilationOptions + " " +
+ Options.UserCompilationOptions);
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
+
+ CompilationOptions += " " + Options.InternalCompilationOptions;
+ auto &Program = addProgram(ImageId, TgtImage);
+
+ int32_t RC = Program.buildModules(CompilationOptions);
+ if (RC != OFFLOAD_SUCCESS)
+ return Plugin::check(RC, "Error in buildModules %d", RC);
+
+ RC = Program.linkModules();
+ if (RC != OFFLOAD_SUCCESS)
+ return Plugin::check(RC, "Error in linkModules %d", RC);
+
+ RC = Program.loadModuleKernels();
+ if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in loadModuleKernels %d", RC);
+
+ return &Program;
+}
+
+Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
+  // Ignored for now.
+  // TODO: properly unload the L0Program.
+ return Plugin::success();
+}
+
+Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
+ bool ReleaseQueue) {
+ if (!ReleaseQueue) {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "Support for ReleaseQueue=false in %s"
+ " not implemented yet\n",
+ __func__);
+ }
+ int32_t RC = synchronize(&AsyncInfo);
+ return Plugin::check(RC, "Error in synchronizeImpl %d", RC);
+}
+
+Expected<bool>
+L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper);
+ const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+ if (!IsAsync)
+ return false;
+
+ auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+  return !AsyncQueue->WaitEvents.empty();
+}
+
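+/// Non-blocking progress check. If no wait events remain outstanding, the
+/// delayed host-side copies are committed and the queue is released;
+/// otherwise the queue is left untouched.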
+Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
+ const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+ if (!IsAsync)
+ return Plugin::success();
+
+ auto &Plugin = getPlugin();
+ auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+ if (!AsyncQueue->WaitEvents.empty())
+ return Plugin::success();
+
+ // Commit delayed USM2M copies
+ for (auto &USM2M : AsyncQueue->USM2MList) {
+ std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+ std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+ }
+ // Commit delayed H2M copies
+ for (auto &H2M : AsyncQueue->H2MList) {
+ std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+ static_cast<char *>(std::get<1>(H2M)));
+ }
+ Plugin.releaseAsyncQueue(AsyncQueue);
+ getStagingBuffer().reset();
+ AsyncInfo.Queue = nullptr;
+
+ return Plugin::success();
+}
+
+void *L0DeviceTy::allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) {
+ return dataAlloc(Size, /*Align=*/0, Kind,
+ /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
+ /*DevMalloc=*/false);
+}
+
+int L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+ return dataDelete(TgtPtr);
+}
+
+Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
+ return Plugin::check(RC, "Error in dataSubmitImpl %d", RC);
+}
+
+Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
+ return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC);
+}
+
+Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+ void *DstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+
+ L0DeviceTy &L0DstDev = L0DeviceTy::makeL0Device(DstDev);
+ // Use copy engine only for across-tile/device copies.
+ const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice();
+
+ if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) {
+ if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size,
+ (__tgt_async_info *)AsyncInfoWrapper))
+ return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+ } else {
+ if (enqueueMemCopy(DstPtr, SrcPtr, Size,
+ /* AsyncInfo */ nullptr,
+ /* Locked */ false, UseCopyEngine))
+ return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+ }
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ AsyncQueueTy *Queue = AsyncInfoWrapper.getQueueAs<AsyncQueueTy *>();
+ if (!Queue) {
+ Queue = getPlugin().getAsyncQueue();
+ AsyncInfoWrapper.setQueueAs<AsyncQueueTy *>(Queue);
+ }
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
+ if (!Info->Context)
+ Info->Context = getZeContext();
+ if (!Info->Device)
+ Info->Device = reinterpret_cast<void *>(getZeDevice());
+ return Plugin::success();
+}
+
+Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
+ InfoTreeNode Info;
+ Info.add("Device Number", getDeviceId());
+ Info.add("Device Name", getNameCStr());
+ Info.add("Device PCI ID", getPCIId());
+ Info.add("Device UUID", getUuid().c_str());
+ Info.add("Number of total EUs", getNumEUs());
+ Info.add("Number of threads per EU", getNumThreadsPerEU());
+ Info.add("EU SIMD width", getSIMDWidth());
+ Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
+ Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
+ Info.add("Number of slices", getNumSlices());
+ Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
+ Info.add("Global memory size (bytes)", getGlobalMemorySize());
+ Info.add("Cache size (bytes)", getCacheSize());
+ Info.add("Max clock frequency (MHz)", getClockRate());
+ return Info;
+}
+
+Expected<GenericKernelTy &> L0DeviceTy::constructKernel(const char *Name) {
+ // Allocate and construct the L0 kernel.
+ L0KernelTy *L0Kernel = getPlugin().allocate<L0KernelTy>();
+ if (!L0Kernel)
+ return Plugin::error(ErrorCode::UNKNOWN,
+ "Failed to allocate memory for L0 kernel");
+
+ new (L0Kernel) L0KernelTy(Name);
+
+ return *L0Kernel;
+}
+
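+/// Query the USM type of Ptr (host/device/shared) via
+/// zeMemGetAllocProperties. Pointers unknown to the L0 context (e.g. plain
+/// malloc'ed host memory) are reported as ZE_MEMORY_TYPE_UNKNOWN.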
+uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const {
+  ze_memory_allocation_properties_t Properties = {
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
+      nullptr,                // extension
+      ZE_MEMORY_TYPE_UNKNOWN, // type
+      0,                      // id
+      0,                      // page size
+  };
+
+  ze_result_t RC;
+  CALL_ZE(RC, zeMemGetAllocProperties, getZeContext(), Ptr, &Properties,
+          nullptr);
+
+  if (RC == ZE_RESULT_ERROR_INVALID_ARGUMENT)
+    return ZE_MEMORY_TYPE_UNKNOWN;
+  return Properties.type;
+}
+
+interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType,
+ int32_t NumPrefers,
+ interop_spec_t *Prefers) {
+  // No supported preference found; default to level_zero, non-ordered.
+ return interop_spec_t{
+ tgt_fr_level_zero, {forceInorderInterop() /*inorder*/, 0}, 0};
+}
+
+Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
+ interop_spec_t &InteropSpec) {
+ auto Ret =
+ new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext);
+ Ret->fr_id = tgt_fr_level_zero;
+ Ret->vendor_id = omp_vendor_intel;
+
+ if (InteropContext == kmp_interop_type_target ||
+ InteropContext == kmp_interop_type_targetsync) {
+ Ret->device_info.Platform = getZeDriver();
+ Ret->device_info.Device = getZeDevice();
+ Ret->device_info.Context = getZeContext();
+ }
+
+ Ret->rtl_property = new L0Interop::Property();
+ if (InteropContext == kmp_interop_type_targetsync) {
+ Ret->async_info = new __tgt_async_info();
+ auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
+
+ bool InOrder = InteropSpec.attrs.inorder;
+ Ret->attrs.inorder = InOrder;
+ if (useImmForInterop()) {
+ auto CmdList = createImmCmdList(InOrder);
+ Ret->async_info->Queue = CmdList;
+ L0->ImmCmdList = CmdList;
+ } else {
+ Ret->async_info->Queue = createCommandQueue(InOrder);
+ L0->CommandQueue =
+ static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
+ }
+ }
+
+ return Ret;
+}
+
+Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
+ const auto DeviceId = getDeviceId();
+
+ if (!Interop || Interop->device_id != (intptr_t)DeviceId) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ if (Interop->async_info && Interop->async_info->Queue) {
+ if (useImmForInterop()) {
+ auto ImmCmdList = L0->ImmCmdList;
+ CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList);
+ } else {
+ auto CmdQueue = L0->CommandQueue;
+ CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue);
+ }
+ }
+ delete L0;
+ delete Interop;
+
+ return Plugin::success();
+}
+
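+/// Enqueue a blocking memory copy. With immediate command lists the host
+/// waits on the copy's signal event; otherwise a regular command list is
+/// closed, executed, synchronized, and reset for reuse.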
+int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo, bool Locked,
+ bool UseCopyEngine) {
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+ ze_event_handle_t Event = nullptr;
+
+ if (useImmForCopy()) {
+ CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+ Event = getEvent();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ } else {
+ if (UseCopyEngine) {
+ CmdList = getCopyCmdList();
+ CmdQueue = getCopyCmdQueue();
+ } else {
+ CmdList = getCmdList();
+ CmdQueue = getCmdQueue();
+ }
+
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ if (Locked) {
+ CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ } else {
+ CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+ CmdQueue, 1, &CmdList, nullptr);
+ }
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue non-blocking memory copy. This function is invoked only when IMM is
+/// fully enabled and async mode is requested.
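+/// In AsyncOrdered mode commands form a chain, so waiting on the most
+/// recently enqueued event is sufficient; in unordered async mode a copy
+/// only needs to wait on a pending kernel event, if any.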
+int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo,
+ bool CopyTo) {
+ const bool Ordered =
+ (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+ ze_event_handle_t SignalEvent = getEvent();
+ size_t NumWaitEvents = 0;
+ ze_event_handle_t *WaitEvents = nullptr;
+ AsyncQueueTy *AsyncQueue = reinterpret_cast<AsyncQueueTy *>(AsyncInfo->Queue);
+ if (!AsyncQueue->WaitEvents.empty()) {
+ // Use a single wait event if events are ordered or a kernel event exists.
+ NumWaitEvents = 1;
+ if (Ordered)
+ WaitEvents = &AsyncQueue->WaitEvents.back();
+ else if (AsyncQueue->KernelEvent)
+ WaitEvents = &AsyncQueue->KernelEvent;
+ else
+ NumWaitEvents = 0;
+ }
+ auto CmdList = getImmCopyCmdList();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ SignalEvent, NumWaitEvents, WaitEvents);
+ AsyncQueue->WaitEvents.push_back(SignalEvent);
+ return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue memory fill
+int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
+ size_t PatternSize, size_t Size) {
+ if (useImmForCopy()) {
+ const auto CmdList = getImmCopyCmdList();
+ auto Event = getEvent();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+ PatternSize, Size, Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ } else {
+ auto CmdList = getCopyCmdList();
+ const auto CmdQueue = getCopyCmdQueue();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+ PatternSize, Size, nullptr, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ // TODO: support async version
+ // TODO: convert enqueueMemFill to return Error code
+ if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS)
+ return Plugin::success();
+
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
+}
+
+void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
+ intptr_t Offset, bool UserAlloc, bool DevMalloc,
+ uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+
+ const bool UseDedicatedPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) ||
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+ if (Kind == TARGET_ALLOC_DEFAULT) {
+ if (UserAlloc)
+ Kind = TARGET_ALLOC_DEVICE;
+ else if (AllocOpt == AllocOptionTy::ALLOC_OPT_HOST_MEM)
+ Kind = TARGET_ALLOC_HOST;
+ else if (UseDedicatedPool)
+ Kind = TARGET_ALLOC_DEVICE;
+ else
+ Kind = getAllocKind();
+ }
+ auto &Allocator = getMemAllocator(Kind);
+ return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
+ MemAdvice, AllocOpt);
+}
+
+int32_t L0DeviceTy::dataDelete(void *Ptr) {
+ auto &Allocator = getMemAllocator(Ptr);
+ return Allocator.dealloc(Ptr);
+}
+
+int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
+ ze_result_t RC;
+ CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem,
+ Size);
+ if (RC != ZE_RESULT_SUCCESS) {
+ DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD
+ ".\n",
+ DPxPTR(Mem), DPxPTR(getZeDevice()));
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+// Command queues related functions
+/// Create a command list with given ordinal and flags
+ze_command_list_handle_t L0DeviceTy::createCmdList(
+ ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+ ze_command_list_flags_t Flags, const std::string &DeviceIdStr) {
+  ze_command_list_desc_t CmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+                                        nullptr, // extension
+                                        Ordinal, Flags};
+  ze_command_list_handle_t CmdList;
+  CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &CmdListDesc,
+                   &CmdList);
+  DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
+     ") for device %s.\n",
+     DPxPTR(CmdList), Ordinal, DeviceIdStr.c_str());
+  return CmdList;
+}
+
+/// Create a command list with default flags
+ze_command_list_handle_t
+L0DeviceTy::createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device, uint32_t Ordinal,
+ const std::string &DeviceIdStr) {
+ return (Ordinal == UINT32_MAX)
+ ? nullptr
+ : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
+}
+
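+/// Return the per-thread compute command list, creating it on first use.
+/// Command lists and queues are cached in TLS so concurrent OpenMP threads
+/// do not contend on shared L0 objects.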
+ze_command_list_handle_t L0DeviceTy::getCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getCmdList();
+ if (!CmdList) {
+ CmdList = createCmdList(getZeContext(), getZeDevice(), getComputeEngine(),
+ getZeId());
+ TLS.setCmdList(CmdList);
+ }
+ return CmdList;
+}
+
+/// Create a command queue with given ordinal and flags
+ze_command_queue_handle_t
+L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device, uint32_t Ordinal,
+ uint32_t Index, ze_command_queue_flags_t Flags,
+ const std::string &DeviceIdStr) {
+  ze_command_queue_desc_t CmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+                                          nullptr, // extension
+                                          Ordinal,
+                                          Index,
+                                          Flags,
+                                          ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+                                          ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_queue_handle_t CmdQueue;
+  CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &CmdQueueDesc,
+                   &CmdQueue);
+  DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
+     ", Flags: %" PRIu32 ") for device %s.\n",
+     DPxPTR(CmdQueue), Ordinal, Index, Flags, DeviceIdStr.c_str());
+  return CmdQueue;
+}
+
+/// Create a command queue with default flags
+ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
+ ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+ uint32_t Index, const std::string &DeviceIdStr, bool InOrder) {
+ ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+ return (Ordinal == UINT32_MAX) ? nullptr
+ : createCmdQueue(Context, Device, Ordinal,
+ Index, Flags, DeviceIdStr);
+}
+
+/// Create a new command queue for the given OpenMP device ID
+ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
+  return createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(),
+                        getComputeIndex(), getZeId(), InOrder);
+}
+
+/// Create an immediate command list
+ze_command_list_handle_t
+L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
+ ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+ ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+ nullptr,
+ Ordinal,
+ Index,
+ Flags,
+ ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+ ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+ ze_command_list_handle_t CmdList = nullptr;
+ CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
+ &Desc, &CmdList);
+ DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32
+ ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n",
+ DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr());
+ return CmdList;
+}
+
+/// Create an immediate command list for copying
+ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
+ uint32_t Ordinal = getMainCopyEngine();
+ if (Ordinal == UINT32_MAX)
+ Ordinal = getLinkCopyEngine();
+ if (Ordinal == UINT32_MAX)
+ Ordinal = getComputeEngine();
+ return createImmCmdList(Ordinal, /*Index*/ 0);
+}
+
+ze_command_queue_handle_t L0DeviceTy::getCmdQueue() {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getCmdQueue();
+ if (!CmdQueue) {
+ CmdQueue = createCommandQueue();
+ TLS.setCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+}
+
+ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
+ // Use main copy engine if available
+ if (hasMainCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getCopyCmdList();
+ if (!CmdList) {
+ CmdList = createCmdList(getZeContext(), getZeDevice(),
+ getMainCopyEngine(), getZeId());
+ TLS.setCopyCmdList(CmdList);
+ }
+ return CmdList;
+ }
+ // Use link copy engine if available
+ if (hasLinkCopyEngine())
+ return getLinkCopyCmdList();
+ // Use compute engine otherwise
+ return getCmdList();
+}
+
+ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
+ // Use main copy engine if available
+ if (hasMainCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getCopyCmdQueue();
+ if (!CmdQueue) {
+ CmdQueue = createCmdQueue(getZeContext(), getZeDevice(),
+ getMainCopyEngine(), 0, getZeId());
+ TLS.setCopyCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+ }
+ // Use link copy engine if available
+ if (hasLinkCopyEngine())
+ return getLinkCopyCmdQueue();
+ // Use compute engine otherwise
+ return getCmdQueue();
+}
+
+ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
+ // Use link copy engine if available
+ if (hasLinkCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getLinkCopyCmdList();
+ if (!CmdList) {
+ CmdList =
+ createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+ ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
+ TLS.setLinkCopyCmdList(CmdList);
+ }
+ return CmdList;
+ }
+ // Use main copy engine if available
+ if (hasMainCopyEngine())
+ return getCopyCmdList();
+ // Use compute engine otherwise
+ return getCmdList();
+}
+
+ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
+ // Use link copy engine if available
+ if (hasLinkCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getLinkCopyCmdQueue();
+ if (!CmdQueue) {
+ // Try to use different copy engines for multiple threads
+ uint32_t Index =
+ __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
+ CmdQueue =
+ createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+ Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
+ TLS.setLinkCopyCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+ }
+ // Use main copy engine if available
+ if (hasMainCopyEngine())
+ return getCopyCmdQueue();
+ // Use compute engine otherwise
+ return getCmdQueue();
+}
+
+ze_command_list_handle_t L0DeviceTy::getImmCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getImmCmdList();
+ if (!CmdList) {
+ CmdList = createImmCmdList();
+ TLS.setImmCmdList(CmdList);
+ }
+ return CmdList;
+}
+
+ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getImmCopyCmdList();
+ if (!CmdList) {
+ CmdList = createImmCopyCmdList();
+ TLS.setImmCopyCmdList(CmdList);
+ }
+ return CmdList;
+}
+
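+/// Append a barrier on the copy path so that commands submitted after the
+/// fence observe the completion of earlier copies. In AsyncOrdered mode
+/// commands already execute in order, so no barrier is needed.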
+Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
+ const bool Ordered =
+ (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+
+ // Nothing to do if everything is ordered
+ if (Ordered)
+ return Plugin::success();
+
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+
+ if (useImmForCopy()) {
+ CmdList = getImmCopyCmdList();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ } else {
+ CmdList = getCopyCmdList();
+ CmdQueue = getCopyCmdQueue();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+ CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+ }
+
+ return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
new file mode 100644
index 0000000000000..06f01f23285fc
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
@@ -0,0 +1,134 @@
+//===--- level_zero/src/L0DynWrapper.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements wrappers for Level Zero API calls resolved through dlopen.
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+#include <memory>
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(zeInit, 1)
+DLWRAP(zeDriverGet, 2)
+DLWRAP(zeDeviceGet, 3)
+DLWRAP(zeDeviceGetSubDevices, 3)
+DLWRAP(zeModuleCreate, 5)
+DLWRAP(zeModuleGetProperties, 2)
+DLWRAP(zeModuleBuildLogDestroy, 1)
+DLWRAP(zeModuleBuildLogGetString, 3)
+DLWRAP(zeModuleGetKernelNames, 3)
+DLWRAP(zeModuleDestroy, 1)
+DLWRAP(zeCommandListAppendBarrier, 4)
+DLWRAP(zeCommandListAppendLaunchKernel, 6)
+DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6)
+DLWRAP(zeCommandListAppendMemoryCopy, 7)
+DLWRAP(zeCommandListAppendMemoryCopyRegion, 12)
+DLWRAP(zeCommandListAppendMemoryFill, 8)
+DLWRAP(zeCommandListAppendMemoryPrefetch, 3)
+DLWRAP(zeCommandListAppendMemAdvise, 5)
+DLWRAP(zeCommandListClose, 1)
+DLWRAP(zeCommandListCreate, 4)
+DLWRAP(zeCommandListCreateImmediate, 4)
+DLWRAP(zeCommandListDestroy, 1)
+DLWRAP(zeCommandListReset, 1)
+DLWRAP(zeCommandQueueCreate, 4)
+DLWRAP(zeCommandQueueDestroy, 1)
+DLWRAP(zeCommandQueueExecuteCommandLists, 4)
+DLWRAP(zeCommandQueueSynchronize, 2)
+DLWRAP(zeContextCreate, 3)
+DLWRAP(zeContextDestroy, 1)
+DLWRAP(zeContextMakeMemoryResident, 4)
+DLWRAP(zeDeviceCanAccessPeer, 3)
+DLWRAP(zeDeviceGetProperties, 2)
+DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3)
+DLWRAP(zeDeviceGetComputeProperties, 2)
+DLWRAP(zeDeviceGetMemoryProperties, 3)
+DLWRAP(zeDeviceGetCacheProperties, 3)
+DLWRAP(zeDeviceGetGlobalTimestamps, 3)
+DLWRAP(zeDriverGetApiVersion, 2)
+DLWRAP(zeDriverGetExtensionFunctionAddress, 3)
+DLWRAP(zeDriverGetExtensionProperties, 3)
+DLWRAP(zeEventCreate, 3)
+DLWRAP(zeEventDestroy, 1)
+DLWRAP(zeEventHostReset, 1)
+DLWRAP(zeEventHostSynchronize, 2)
+DLWRAP(zeEventPoolCreate, 5)
+DLWRAP(zeEventPoolDestroy, 1)
+DLWRAP(zeEventQueryKernelTimestamp, 2)
+DLWRAP(zeFenceCreate, 3)
+DLWRAP(zeFenceDestroy, 1)
+DLWRAP(zeFenceHostSynchronize, 2)
+DLWRAP(zeKernelCreate, 3)
+DLWRAP(zeKernelDestroy, 1)
+DLWRAP(zeKernelGetName, 3)
+DLWRAP(zeKernelGetProperties, 2)
+DLWRAP(zeKernelSetArgumentValue, 4)
+DLWRAP(zeKernelSetGroupSize, 4)
+DLWRAP(zeKernelSetIndirectAccess, 2)
+DLWRAP(zeKernelSuggestGroupSize, 7)
+DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2)
+DLWRAP(zeMemAllocDevice, 6)
+DLWRAP(zeMemAllocHost, 5)
+DLWRAP(zeMemAllocShared, 7)
+DLWRAP(zeMemFree, 2)
+DLWRAP(zeMemGetAddressRange, 4)
+DLWRAP(zeMemGetAllocProperties, 4)
+DLWRAP(zeModuleDynamicLink, 3)
+DLWRAP(zeModuleGetGlobalPointer, 4)
+DLWRAP(zesDeviceEnumMemoryModules, 3)
+DLWRAP(zesMemoryGetState, 2)
+
+DLWRAP_FINALIZE()
+
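+// Each DLWRAP(Sym, Arity) above generates a ze* trampoline that forwards
+// through a function pointer table (see DLWrap.h); loadLevelZero() below
+// populates that table via dlsym once the library has been loaded.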
+#ifndef LEVEL_ZERO_LIBRARY
+#error "Level Zero library not defined"
+#endif
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+static bool loadLevelZero() {
+ const char *L0Library = LEVEL_ZERO_LIBRARY;
+ std::string ErrMsg;
+
+ DP("Trying to load %s\n", L0Library);
+ auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+ llvm::sys::DynamicLibrary::getPermanentLibrary(L0Library, &ErrMsg));
+ if (!DynlibHandle->isValid()) {
+ if (ErrMsg.empty())
+ ErrMsg = "unknown error";
+ DP("Unable to load library '%s': %s!\n", L0Library, ErrMsg.c_str());
+ return false;
+ }
+
+ for (size_t I = 0; I < dlwrap::size(); I++) {
+ const char *Sym = dlwrap::symbol(I);
+
+ void *P = DynlibHandle->getAddressOfSymbol(Sym);
+ if (P == nullptr) {
+ DP("Unable to find '%s' in '%s'!\n", Sym, L0Library);
+ return false;
+ }
+ DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+
+ *dlwrap::pointer(I) = P;
+ }
+
+ return true;
+}
+
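+/// zeInit is the only entry point defined directly (DLWRAP_INTERNAL above):
+/// the Level Zero library is loaded lazily on the first zeInit call, which
+/// is then forwarded to the real implementation.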
+ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) {
+ if (!loadLevelZero())
+ return ZE_RESULT_ERROR_UNKNOWN;
+ return dlwrap_zeInit(flags);
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
new file mode 100644
index 0000000000000..d1cb0b7bd50bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+ uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+
+  auto &L0Device = L0DeviceTy::makeL0Device(GenericDevice);
+  int32_t RC = runTargetTeamRegion(L0Device, KernelArgs,
+ std::move(LaunchParams), AsyncInfoWrapper);
+ if (RC == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+ "Error in launch Kernel %s: %d", getName(), RC);
+}
+
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+ const auto *KernelName = getName();
+
+ auto Module = Program.findModuleFromKernelName(KernelName);
+ ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0,
+ KernelName};
+ CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
+ return Plugin::success();
+}
+
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+ DeviceImageTy &Image) {
+ auto &Program = L0ProgramTy::makeL0Program(Image);
+
+ Error Err = buildKernel(Program);
+ if (Err)
+ return Err;
+ Program.addKernel(this);
+
+ return Plugin::success();
+}
+
+/// Read global thread limit and max teams from the host runtime. These values
+/// are subject to change at any program point, so every kernel execution
+/// needs to read the most recent values.
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+  int ThrLimit = omp_get_teams_thread_limit();
+  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+  // An unset limit may be reported as INT_MAX by the host runtime.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t ThreadLimit =
+ (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int32_t>::max)())
+ ? ThrLimit
+ : 0;
+
+ int NTeams = omp_get_max_teams();
+ DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+ // omp_get_max_teams() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t NumTeams =
+ (NTeams > 0 && NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams
+ : 0;
+
+ return {NumTeams, ThreadLimit};
+}
+
+void L0KernelTy::decideKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool IsTeamsNDRange) const {
+
+ const KernelPropertiesTy &KernelPR = getProperties();
+
+ const auto DeviceId = Device.getDeviceId();
+ bool MaxGroupSizeForced = false;
+ bool MaxGroupCountForced = false;
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+ const auto &Option = LevelZeroPluginTy::getOptions();
+ const auto OptSubscRate = Option.SubscriptionRate;
+
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+ if (KernelMaxThreadGroupSize < MaxGroupSize) {
+ MaxGroupSize = KernelMaxThreadGroupSize;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Capping maximum team size to %" PRIu32
+ " due to kernel constraints.\n",
+ MaxGroupSize);
+ }
+
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t MaxGroupCount = 0;
+ if (NumTeams > 0) {
+ MaxGroupCount = NumTeams;
+ MaxGroupCountForced = true;
+ }
+
+ if (MaxGroupCountForced) {
+ // If number of teams is specified by the user, then use KernelWidth
+ // WIs per WG by default, so that it matches
+ // decideLoopKernelGroupArguments() behavior.
+ if (!MaxGroupSizeForced) {
+ MaxGroupSize = KernelWidth;
+ }
+ } else {
+ const uint32_t NumSubslices = Device.getNumSubslices();
+ uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+ if (HalfNumThreads)
+ NumThreadsPerSubslice /= 2;
+
+ MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+ if (MaxGroupSizeForced) {
+ // Set group size for the HW capacity
+ uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+ uint32_t NumGroupsPerSubslice =
+ (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ } else {
+ assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+ assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+ "Invalid maxGroupSize");
+ // Maximize group size
+ while (MaxGroupSize >= KernelWidth) {
+ uint32_t NumThreadsPerGroup =
+ (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+ if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+ uint32_t NumGroupsPerSubslice =
+ NumThreadsPerSubslice / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ break;
+ }
+ MaxGroupSize -= KernelWidth;
+ }
+ }
+ }
+
+ uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ bool UsedReductionSubscriptionRate = false;
+ if (!MaxGroupCountForced) {
+    GRPCounts[0] *= OptSubscRate;
+
+ size_t LoopTripcount = 0;
+ if (LoopLevels) {
+ // TODO: consider other possible LoopDesc uses
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+          "Loop descriptor provided but specific ND-range is disabled\n");
+ // TODO: get rid of this constraint
+ if (LoopLevels->NumLoops > 1) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+ LoopLevels->NumLoops);
+ } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+ LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+ LoopLevels->Levels[0].Stride) /
+ LoopLevels->Levels[0].Stride;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+ " = %zu\n",
+ LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+ LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+ LoopTripcount);
+ }
+ }
+
+ if (LoopTripcount && !UsedReductionSubscriptionRate) {
+ const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+ Device.getNumSubslices() * SIMDWidth;
+ size_t AdjustedGroupCount =
+ IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+ MaxTotalThreads / GRPSizes[0])
+ : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = (std::max)(AdjustedGroupCount, size_t{1});
+ AdjustedGroupCount *= OptSubscRate;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Adjusting number of teams using the loop tripcount\n");
+ if (AdjustedGroupCount < GRPCounts[0])
+ GRPCounts[0] = AdjustedGroupCount;
+ }
+ }
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the total number of HW threads required to execute a loop kernel
+// compiled with the given SIMDWidth, for the given loop trip counts and
+// group sizes. Returns UINT64_MAX if the computation overflows.
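+// For example (illustrative numbers): TripCounts = {4096, 1, 1},
+// GroupSizes = {128, 1, 1}, and SIMDWidth = 16 give 32 groups of
+// ceil(128 / 16) = 8 HW threads each, i.e. 256 threads in total.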
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+ const uint32_t (&GroupSizes)[3],
+ uint32_t SIMDWidth) {
+ uint64_t GroupCount[3];
+ for (int I = 0; I < 3; ++I) {
+ if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[I] =
+ (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
+ if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
+ return (std::numeric_limits<uint64_t>::max)();
+ }
+ for (int I = 1; I < 3; ++I) {
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[0] *= GroupCount[I];
+ }
+ // Multiplication of the group sizes must never overflow uint64_t
+ // for any existing device.
+ uint64_t LocalWorkSize =
+ uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+ uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
+
+ // Check that the total number of threads fits uint64_t.
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
+ return (std::numeric_limits<uint64_t>::max)();
+
+ return GroupCount[0] * ThreadsPerWG;
+}
+
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const {
+
+ const auto DeviceId = Device.getDeviceId();
+ const auto &Options = LevelZeroPluginTy::getOptions();
+ const auto &KernelPR = getProperties();
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+ bool MaxGroupSizeForced = false;
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t GRPCounts[3] = {1, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ TgtLoopDescTy *Levels = LoopLevels->Levels;
+ int32_t DistributeDim = LoopLevels->DistributeDim;
+ assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+ "Invalid distribute dimension.");
+ int32_t NumLoops = LoopLevels->NumLoops;
+ assert((NumLoops > 0 && NumLoops <= 3) &&
+ "Invalid loop nest description for ND partitioning");
+
+ // Compute global widths for X/Y/Z dimensions.
+ size_t TripCounts[3] = {1, 1, 1};
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+ ", Stride = %" PRId64 "\n",
+ I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+ if (Levels[I].Ub < Levels[I].Lb)
+ TripCounts[I] = 0;
+ else
+ TripCounts[I] =
+ (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+ }
+
+ // Check if any of the loop has zero iterations.
+ if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+ std::fill(GroupSizes, GroupSizes + 3, 1);
+ std::fill(GRPCounts, GRPCounts + 3, 1);
+ if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+ // There is a distribute dimension, and the distribute loop
+ // has non-zero iterations, but some inner parallel loop
+ // has zero iterations. We still want to split the distribute
+ // loop's iterations between many WGs (of size 1), but the inner/lower
+ // dimensions should be 1x1.
+ // Note that this code is currently dead, because we are not
+ // hoisting the inner loops' bounds outside of the target regions.
+ // The code is here just for completeness.
+ size_t DistributeTripCount = TripCounts[DistributeDim];
+ if (DistributeTripCount > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n",
+ DistributeTripCount);
+ return OFFLOAD_FAIL;
+ }
+ GRPCounts[DistributeDim] = DistributeTripCount;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (!MaxGroupSizeForced) {
+    // Choose the group sizes: cap dimension 0 at the kernel width and,
+    // when there is no distribute dimension, shrink it further below to
+    // spread work across more HW threads. Note that in case of a
+    // user-specified LWS, GRPSizes[0] is already set accordingly.
+ size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+ if (DistributeDim > 0) {
+ // There is a distribute dimension.
+ GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+ GlobalSizes[DistributeDim] = 1;
+ }
+
+ {
+ if (MaxGroupSize > KernelPR.Width) {
+ GRPSizes[0] = KernelPR.Width;
+ }
+ if (DistributeDim == 0) {
+ // If there is a distribute dimension, then we do not use
+ // thin HW threads, since we do not know anything about
+ // the iteration space of the inner parallel loop regions.
+ //
+          // If there is no distribute dimension, then try to use thinner
+ // HW threads to get more independent HW threads executing
+ // the kernel - this may allow more parallelism due to
+ // the stalls being distributed across multiple HW threads rather
+ // than across SIMD lanes within one HW thread.
+ assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
+ "Unexpected team sizes for dimensions 1 or/and 2.");
+ uint32_t SimdWidth = KernelPR.SIMDWidth;
+ uint64_t TotalThreads = Device.getTotalThreads();
+ TotalThreads *= Options.ThinThreadsThreshold;
+
+ uint32_t GRPSizePrev = GRPSizes[0];
+ uint64_t ThreadsNeeded =
+ computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ while (ThreadsNeeded < TotalThreads) {
+ GRPSizePrev = GRPSizes[0];
+          // Try to halve the local work size (if possible) and see
+ // how many HW threads the kernel will require with this
+ // new local work size.
+ // In most implementations the initial GRPSizes[0]
+ // will be a power-of-two.
+ if (GRPSizes[0] <= 1)
+ break;
+ GRPSizes[0] >>= 1;
+ ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ }
+ GRPSizes[0] = GRPSizePrev;
+ }
+ }
+ }
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ if (I < DistributeDim) {
+ GRPCounts[I] = 1;
+ continue;
+ }
+ size_t Trip = TripCounts[I];
+ if (GRPSizes[I] >= Trip)
+ GRPSizes[I] = Trip;
+ size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
+ if (Count > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n", Count);
+ return OFFLOAD_FAIL;
+ }
+ GRPCounts[I] = (uint32_t)Count;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+
+ return OFFLOAD_SUCCESS;
+}
+
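+/// Compute the team sizes and counts for a kernel launch, taking into
+/// account the thread_limit/num_teams clauses, the corresponding ICVs,
+/// kernel properties, and device limits.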
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts,
+ void *LoopDesc,
+ bool &AllowCooperative) const {
+
+ const auto SubId = SubDevice.getDeviceId();
+ const auto &KernelPR = getProperties();
+
+ // Detect if we need to reduce available HW threads. We need this adjustment
+ // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+ static std::once_flag OnceFlag;
+ static bool ZeDebugEnabled = false;
+ std::call_once(OnceFlag, []() {
+ const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
+ if (EnvVal && std::atoi(EnvVal) == 1)
+ ZeDebugEnabled = true;
+ });
+
+ // Read the most recent global thread limit and max teams.
+ auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+
+ bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+ bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+ assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+ if (ThreadLimit > 0) {
+    // Use the thread_limit clause value by default.
+ DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+ ThreadLimit);
+ } else if (ThreadLimitICV > 0) {
+ // else use thread-limit-var ICV
+ ThreadLimit = ThreadLimitICV;
+ DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+ }
+
+ size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+ // Set correct max group size if the kernel was compiled with explicit SIMD
+ if (SIMDWidth == 1) {
+ MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+ }
+
+ if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+ MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+ DP("Capping maximum team size to %zu due to kernel constraints.\n",
+ MaxThreadLimit);
+ }
+
+ if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
+ ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds current maximum %zu. Adjusted.\n",
+ MaxThreadLimit);
+ }
+  if (NumTeams > 0) {
+    DP("Number of teams is set to %" PRId32
+       " (num_teams clause or no teams construct)\n",
+       NumTeams);
+  } else if (NumTeamsICV > 0) {
+    // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+         "Using OMP_NUM_TEAMS(%" PRId32 ")\n", NumTeamsICV);
+    NumTeams = NumTeamsICV;
+    DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+       NumTeams);
+  }
+
+  decideKernelGroupArguments(SubDevice, (uint32_t)NumTeams,
+                             (uint32_t)ThreadLimit,
+                             static_cast<TgtNDRangeDescTy *>(LoopDesc),
+                             GroupSizes, GroupCounts, HalfNumThreads, false);
+  AllowCooperative = false;
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &L0Device,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ __tgt_async_info *AsyncInfo) const {
+ // Libomptarget can pass negative NumTeams and ThreadLimit now after
+ // introducing __tgt_target_kernel. This happens only when we have valid
+ // LoopDesc and the region is not a teams region.
+
+ auto zeKernel = getZeKernel();
+  auto DeviceId = L0Device.getDeviceId();
+ int32_t NumArgs = KernelArgs.NumArgs;
+ int32_t NumTeams = KernelArgs.NumTeams[0];
+ int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+ void *LoopDesc = nullptr;
+
+ if (NumTeams < 0)
+ NumTeams = 0;
+ if (ThreadLimit < 0)
+ ThreadLimit = 0;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+  auto &Plugin = L0Device.getPlugin();
+ auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+ auto *IdStr = Device.getZeIdCStr();
+ auto &Options = LevelZeroPluginTy::getOptions();
+ bool IsAsync = AsyncInfo && Device.asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = Plugin.getAsyncQueue();
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+  auto *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+
+ // We need to get a non-const version of the Properties structure in order to
+ // use its lock and be able to cache the group params and indirect flags
+ auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
+ // Protect from kernel preparation to submission as kernels are shared.
+ std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
+
+ // Decide group sizes and counts
+ uint32_t GroupSizes[3];
+ ze_group_count_t GroupCounts;
+
+ bool AllowCooperative = false;
+
+ // Check if we can reuse previous group parameters
+ bool GroupParamsReused = KernelPR.reuseGroupParams(
+ static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
+ GroupSizes, GroupCounts, AllowCooperative);
+
+ if (!GroupParamsReused) {
+ auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+ GroupCounts, LoopDesc, AllowCooperative);
+
+ if (RC != OFFLOAD_SUCCESS) {
+ return RC;
+ }
+
+ KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
+ NumTeams, ThreadLimit, GroupSizes, GroupCounts,
+ AllowCooperative);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+ GroupCounts.groupCountX, GroupCounts.groupCountY,
+ GroupCounts.groupCountZ);
+  for (int32_t I = 0; I < NumArgs; I++) {
+    void *Arg = static_cast<void **>(LaunchParams.Data)[I];
+    CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+                     Arg == nullptr ? nullptr : &Arg);
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+         ") was set successfully for device %s.\n",
+         I, DPxPTR(Arg), IdStr);
+  }
+
+ // Set Kernel Indirect flags
+ auto &PrevFlags = KernelPR.IndirectAccessFlags;
+ ze_kernel_indirect_access_flags_t Flags = 0;
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+ if (PrevFlags != Flags) {
+ // Combine with common access flags
+ const auto FinalFlags = Device.getIndirectFlags() | Flags;
+ CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+ DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+ PrevFlags = Flags;
+ }
+
+ if (!GroupParamsReused) {
+ CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ }
+
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+ const bool UseImmCmdList = Device.useImmForCompute();
+
+ if (UseImmCmdList) {
+ CmdList = Device.getImmCmdList();
+ // Command queue is not used with immediate command list
+ } else {
+ CmdList = Device.getCmdList();
+ CmdQueue = Device.getCmdQueue();
+ }
+
+ if (UseImmCmdList) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Using immediate command list for kernel submission.\n");
+ auto Event = Device.getEvent();
+ size_t NumWaitEvents = 0;
+ ze_event_handle_t *WaitEvents = nullptr;
+ if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
+ if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
+ NumWaitEvents = 1;
+ WaitEvents = &AsyncQueue->WaitEvents.back();
+ } else {
+ NumWaitEvents = AsyncQueue->WaitEvents.size();
+ WaitEvents = AsyncQueue->WaitEvents.data();
+ }
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+ if (AllowCooperative)
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &GroupCounts, Event, NumWaitEvents,
+ WaitEvents);
+ else
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &GroupCounts, Event, NumWaitEvents, WaitEvents);
+ KernelLock.unlock();
+ if (IsAsync) {
+ AsyncQueue->WaitEvents.push_back(Event);
+ AsyncQueue->KernelEvent = Event;
+ } else {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ Device.releaseEvent(Event);
+ }
+ } else {
+ ze_event_handle_t Event = nullptr;
+ KernelLock.unlock();
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+ CmdQueue, 1, &CmdList, nullptr);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ if (Event) {
+ Device.releaseEvent(Event);
+ }
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+ IdStr);
+
+ return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
new file mode 100644
index 0000000000000..790acdd9f568f
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -0,0 +1,637 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Memory.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
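+/// Return a free chunk from this block, or nullptr if the block is full.
+/// FreeSlot caches the most recently freed slot so a dealloc/alloc pair
+/// can skip the linear scan over UsedSlots.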
+void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
+ if (isFull())
+ return nullptr;
+ if (FreeSlot != UINT32_MAX) {
+ const uint32_t Slot = FreeSlot;
+ FreeSlot = UINT32_MAX;
+ UsedSlots[Slot] = true;
+ NumUsedSlots++;
+ return reinterpret_cast<void *>(Base + Slot * ChunkSize);
+ }
+ for (uint32_t I = 0; I < NumSlots; I++) {
+ if (UsedSlots[I])
+ continue;
+ UsedSlots[I] = true;
+ NumUsedSlots++;
+ return reinterpret_cast<void *>(Base + I * ChunkSize);
+ }
+ // Should not reach here.
+ assert(0 && "Inconsistent memory pool state");
+ return nullptr;
+}
+
+/// Deallocate the given memory
+void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
+  assert(contains(Mem) && "Inconsistent memory pool state");
+ const uint32_t Slot = (reinterpret_cast<uintptr_t>(Mem) - Base) / ChunkSize;
+ UsedSlots[Slot] = false;
+ NumUsedSlots--;
+ FreeSlot = Slot;
+}
+
+MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *Alloc,
+                                     const L0OptionsTy &Option) {
+  AllocKind = Kind;
+  Allocator = Alloc;
+
+ // Read user-defined options
+ const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
+ const size_t UserAllocMax = UserOptions[0];
+ const size_t UserCapacity = UserOptions[1];
+ const size_t UserPoolSize = UserOptions[2];
+
+ BlockCapacity = UserCapacity;
+ PoolSizeMax = UserPoolSize << 20; // MB to B
+ PoolSize = 0;
+
+ auto Context = Allocator->L0Context->getZeContext();
+ const auto Device = Allocator->Device;
+
+ // Check page size used for this allocation kind to decide minimum
+ // allocation size when allocating from L0.
+ void *Mem = Allocator->allocL0(8, 0, AllocKind);
+ ze_memory_allocation_properties_t AP{
+ ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
+ ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
+ CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+ AllocUnit = (std::max)(AP.pageSize, AllocUnit);
+ CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+
+ bool IsDiscrete = false;
+ if (Device) {
+    ze_device_properties_t Properties{};
+    Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    Properties.pNext = nullptr;
+ CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+ IsDiscrete = Device->isDiscreteDevice();
+
+ if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
+ // Use page size as minimum chunk size for USM shared on discrete
+ // device.
+ // FIXME: pageSize is not returned correctly (=0) on some new devices,
+ // so use fallback value for now.
+ AllocMin = (std::max)(AP.pageSize, AllocUnit);
+ AllocUnit = AllocMin * BlockCapacity;
+ }
+ }
+
+ // Convert MB to B and round up to power of 2
+ AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
+ if (AllocMin >= AllocMax) {
+ AllocMax = 2 * AllocMin;
+ DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
+ "requirements.\n",
+ AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+ }
+ assert(AllocMin < AllocMax &&
+ "Invalid parameters while initializing memory pool");
+ const auto MinSize = getBucketId(AllocMin);
+ const auto MaxSize = getBucketId(AllocMax);
+ Buckets.resize(MaxSize - MinSize + 1);
+ BucketStats.resize(Buckets.size(), {0, 0});
+
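+  // Bucket I serves chunks of size AllocMin << I. Assuming getBucketId
+  // returns the rounded-up power-of-two index, AllocMin = 64KB and
+  // AllocMax = 8MB (illustrative values) yield 8 buckets, and a 100KB
+  // request is served from the 128KB bucket.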
+ // Set bucket parameters
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const size_t ChunkSize = AllocMin << I;
+ size_t BlockSize = ChunkSize * BlockCapacity;
+    // On a discrete device, the cost of a native L0 allocation doubles once
+    // the requested size doubles beyond a certain threshold, so allocating
+    // a larger block does not pay off. It is better to keep a single chunk
+    // in a single block in such cases.
+ if (BlockSize <= AllocUnit) {
+ BlockSize = AllocUnit; // Allocation unit is already large enough
+ } else if (IsDiscrete) {
+ // Do not preallocate if it does not pay off
+ if (ChunkSize >= L0UsmPreAllocThreshold ||
+ (AllocKind == TARGET_ALLOC_HOST &&
+ ChunkSize >= L0HostUsmPreAllocThreshold))
+ BlockSize = ChunkSize;
+ }
+ BucketParams.emplace_back(ChunkSize, BlockSize);
+ }
+
+ DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
+ "AllocMax = %zu, "
+ "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
+ ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+ BlockCapacity, PoolSizeMax);
+}
+
+// Used for reduction pool
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *Alloc,
+                                     const L0OptionsTy &Option) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = Alloc;
+ AllocMin = AllocUnit = 1024 << 6; // 64KB
+ AllocMax = Option.ReductionPoolInfo[0] << 20;
+ BlockCapacity = Option.ReductionPoolInfo[1];
+ PoolSize = 0;
+ PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20;
+
+ const auto MinSize = getBucketId(AllocMin);
+ const auto MaxSize = getBucketId(AllocMax);
+ Buckets.resize(MaxSize - MinSize + 1);
+ BucketStats.resize(Buckets.size(), {0, 0});
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const size_t ChunkSize = AllocMin << I;
+ BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
+ }
+
+ DP("Initialized reduction scratch pool for device " DPxMOD
+ ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+ DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+// Used for small memory pool with fixed parameters
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *Alloc) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = Alloc;
+ AllocMax = AllocMin;
+ BlockCapacity = AllocUnit / AllocMax;
+ PoolSize = 0;
+ PoolSizeMax = (1 << 20); // this should be sufficiently large
+ Buckets.resize(1);
+ BucketStats.resize(1, {0, 0});
+ BucketParams.emplace_back(AllocMax, AllocUnit);
+ ZeroInit = true;
+ ZeroInitValue.resize(AllocUnit, 0);
+ DP("Initialized zero-initialized reduction counter pool for "
+ "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+ DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+void MemAllocatorTy::MemPoolTy::printUsage() {
+ auto PrintNum = [](uint64_t Num) {
+ if (Num > 1e9)
+ fprintf(stderr, "%11.2e", float(Num));
+ else
+ fprintf(stderr, "%11" PRIu64, Num);
+ };
+
+ bool HasPoolAlloc = false;
+ for (auto &Stat : BucketStats) {
+ if (Stat.first > 0 || Stat.second > 0) {
+ HasPoolAlloc = true;
+ break;
+ }
+ }
+
+ DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+ DPxPTR(Allocator->Device));
+
+ if (HasPoolAlloc) {
+ DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n",
+ AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20);
+ DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)");
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const auto &Stat = BucketStats[I];
+ if (Stat.first > 0 || Stat.second > 0) {
+ DP("-- Bucket[%10zu]:", BucketParams[I].first);
+ PrintNum(Stat.first);
+ PrintNum(Stat.second);
+ fprintf(stderr, "%11.2f\n",
+ float(Stat.second) / float(Stat.first + Stat.second) * 100);
+ }
+ }
+ } else {
+ DP("-- Not used\n");
+ }
+}
+
+/// Release resources used in the pool.
+MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+ const int DebugLevel = getDebugLevel();
+ if (DebugLevel > 0)
+ printUsage();
+ for (auto &Bucket : Buckets) {
+ for (auto *Block : Bucket) {
+ if (DebugLevel > 0)
+ Allocator->log(0, Block->Size, AllocKind);
+ CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
+ reinterpret_cast<void *>(Block->Base));
+ delete Block;
+ }
+ }
+}
+
+/// Allocate the requested size of memory from this pool.
+/// AllocSize is the chunk size internally used for the returned memory.
+void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+ if (Size == 0 || Size > AllocMax)
+ return nullptr;
+
+ const uint32_t BucketId = getBucketId(Size);
+ auto &Blocks = Buckets[BucketId];
+ void *Mem = nullptr;
+
+ for (auto *Block : Blocks) {
+ if (Block->isFull())
+ continue;
+ Mem = Block->alloc();
+ assert(Mem && "Inconsistent state while allocating memory from pool");
+ PtrToBlock.emplace(Mem, Block);
+ break;
+ }
+
+ if (Mem == nullptr) {
+ const bool IsSmallAllocatable =
+ (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
+ const bool IsFull = (PoolSize > PoolSizeMax);
+ if (IsFull && !IsSmallAllocatable)
+ return nullptr;
+ // Bucket is empty or all blocks in the bucket are full
+ const auto ChunkSize = BucketParams[BucketId].first;
+ const auto BlockSize = BucketParams[BucketId].second;
+ void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+
+ if (ZeroInit) {
+ auto RC =
+ Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
+ if (RC != OFFLOAD_SUCCESS) {
+ DP("Failed to zero-initialize pool memory\n");
+ return nullptr;
+ }
+ }
+
+ BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
+ Blocks.push_back(Block);
+ Mem = Block->alloc();
+ PtrToBlock.emplace(Mem, Block);
+ if (IsFull)
+ SmallPoolSize += BlockSize;
+ else
+ PoolSize += BlockSize;
+ DP("New block allocation for %s pool: base = " DPxMOD
+ ", size = %zu, pool size = %zu\n",
+ ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
+ BucketStats[BucketId].first++;
+ } else {
+ BucketStats[BucketId].second++;
+ }
+
+ AllocSize = (AllocMin << BucketId);
+
+ return Mem;
+}
+
+/// Deallocate the specified memory and return the size of the deallocated
+/// chunk.
+size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
+  const auto It = PtrToBlock.find(Ptr);
+  if (It == PtrToBlock.end())
+    return 0;
+  It->second->dealloc(Ptr);
+  const size_t Deallocated = It->second->ChunkSize;
+  PtrToBlock.erase(It);
+  return Deallocated;
+}
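+
+// Typical pairing of the pool methods above (a sketch; Pool is a MemPoolTy):
+//   size_t Chunk = 0;
+//   if (void *P = Pool.alloc(Bytes, Chunk)) {
+//     ... // use P; Chunk is the chunk size backing it
+//     size_t Freed = Pool.dealloc(P); // Freed == Chunk
+//   }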
+
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
+ int32_t Kind, bool InPool,
+ bool ImplicitArg) {
+ const auto Inserted =
+ Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg});
+ // Check if we keep valid disjoint memory ranges.
+ [[maybe_unused]] bool Valid = Inserted.second;
+ if (Valid) {
+ if (Inserted.first != Map.begin()) {
+ const auto I = std::prev(Inserted.first, 1);
+ Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr;
+ }
+ if (Valid) {
+ const auto I = std::next(Inserted.first, 1);
+ if (I != Map.end())
+ Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first;
+ }
+ }
+ assert(Valid && "Invalid overlapping memory allocation");
+ if (ImplicitArg)
+ NumImplicitArgs[Kind]++;
+}
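+
+// Note: the prev/next checks above assume Map is an address-ordered
+// container (e.g. a std::map keyed by pointer), so the neighbors of the
+// inserted entry are the adjacent allocations in address space.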
+
+/// Remove allocation information for the given memory location
+bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
+ MemAllocInfoTy *Removed) {
+ const auto AllocInfo = Map.find(Ptr);
+ if (AllocInfo == Map.end())
+ return false;
+ if (AllocInfo->second.ImplicitArg)
+ NumImplicitArgs[AllocInfo->second.Kind]--;
+ if (Removed)
+ *Removed = AllocInfo->second;
+ Map.erase(AllocInfo);
+ return true;
+}
+
+void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
+ const L0OptionsTy &Option) {
+ SupportsLargeMem = L0Device.supportsLargeMem();
+ IsHostMem = false;
+ Device = &L0Device;
+ L0Context = &L0Device.getL0Context();
+ for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
+ if (Option.MemPoolInfo.count(Kind) > 0) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+ std::forward_as_tuple(Kind, this, Option));
+ }
+ if (getDebugLevel() > 0)
+ Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+ std::tuple<>{});
+ }
+ ReductionPool = std::make_unique<MemPoolTy>(this, Option);
+ CounterPool = std::make_unique<MemPoolTy>(this);
+ updateMaxAllocSize(L0Device);
+}
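+
+// Design note: each MemPoolTy owns its blocks and is neither copied nor
+// moved here; piecewise_construct builds the pools in place inside the map.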
+
+void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
+ const L0OptionsTy &Option) {
+ SupportsLargeMem = Driver.supportsLargeMem();
+ IsHostMem = true;
+ this->L0Context = &Driver;
+ if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ Pools.emplace(std::piecewise_construct,
+ std::forward_as_tuple(TARGET_ALLOC_HOST),
+ std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
+ }
+ if (getDebugLevel() > 0)
+ Stats.emplace(std::piecewise_construct,
+ std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{});
+}
+
+void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
+ // Update the maximum allocation size for this Allocator
+ ze_device_properties_t P;
+ P.maxMemAllocSize = 0;
+ P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ P.pNext = nullptr;
+ CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &P);
+
+ if (IsHostMem) {
+    // MaxAllocSize should be the minimum across all devices of this driver
+ if (MaxAllocSize > P.maxMemAllocSize) {
+ MaxAllocSize = P.maxMemAllocSize;
+ DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
+ DPxPTR(L0Context), MaxAllocSize);
+ }
+ return;
+ }
+
+ MaxAllocSize = P.maxMemAllocSize;
+ DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
+ MaxAllocSize);
+}
+
+/// Release resources and report statistics if requested
+void MemAllocatorTy::deinit() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ // Release RTL-owned memory
+ for (auto *M : MemOwned)
+ dealloc_locked(M);
+ // Release resources used in the pool
+ Pools.clear();
+ ReductionPool.reset(nullptr);
+ CounterPool.reset(nullptr);
+ // Report memory usage if requested
+ if (getDebugLevel() > 0) {
+ for (auto &Stat : Stats) {
+ DP("Memory usage for %s, device " DPxMOD "\n",
+ ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device));
+ const auto &ST = Stat.second;
+ if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
+ DP("-- Not used\n");
+ continue;
+ }
+ DP("-- Allocator: %12s, %12s\n", "Native", "Pool");
+ DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]);
+ DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]);
+ DP("-- Freed : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]);
+ DP("-- InUse : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]);
+ DP("-- PeakUse : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]);
+ DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]);
+ }
+ }
+
+ // mark as deinitialized
+ L0Context = nullptr;
+}
+
+/// Allocate memory with the specified information
+void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
+ intptr_t Offset, bool UserAlloc, bool DevMalloc,
+ uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+ assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
+ Kind == TARGET_ALLOC_SHARED) &&
+ "Unknown memory kind while allocating target memory");
+
+ std::lock_guard<std::mutex> Lock(Mtx);
+
+  // We do not expect a meaningful Align parameter when Offset > 0, so the
+  // following code does not handle that case.
+
+ size_t AllocSize = Size + Offset;
+ void *Mem = nullptr;
+ void *AllocBase = nullptr;
+ const bool UseScratchPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH);
+ const bool UseZeroInitPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+ const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
+
+ if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+    // Pool is enabled for the allocation kind, and no memory advice is in
+    // use. We avoid the pool when meaningful memory advice is given so that
+    // it does not affect sibling allocations in the same block.
+ if (Align > 0)
+ AllocSize += (Align - 1);
+ size_t PoolAllocSize = 0;
+ if (UseScratchPool)
+ AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize);
+ else if (UseZeroInitPool)
+ AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
+ else
+ AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize);
+ if (AllocBase) {
+      uintptr_t Base = (uintptr_t)AllocBase;
+      // Round Base up to the next Align boundary.
+      if (Align > 0)
+        Base = (Base + Align - 1) & ~(Align - 1);
+ Mem = (void *)(Base + Offset);
+ AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc);
+ log(Size, PoolAllocSize, Kind, true /* Pool */);
+ if (DevMalloc)
+ MemOwned.push_back(AllocBase);
+ if (UseDedicatedPool) {
+ DP("Allocated %zu bytes from %s pool\n", Size,
+ UseScratchPool ? "scratch" : "zero-initialized");
+ }
+ return Mem;
+ }
+ }
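+
+  // Alignment sketch for the pool path above: with Align = 64 and a pool
+  // base of 0x1001, Base rounds up to 0x1040 and Mem = Base + Offset; the
+  // extra Align - 1 bytes added to AllocSize keep the rounded address
+  // within the chunk.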
+
+ AllocBase = allocL0(AllocSize, Align, Kind, Size);
+ if (AllocBase) {
+ Mem = (void *)((uintptr_t)AllocBase + Offset);
+ AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
+ if (DevMalloc)
+ MemOwned.push_back(AllocBase);
+ if (UseDedicatedPool) {
+      // We do not want this to happen in general.
+ DP("Allocated %zu bytes from L0 for %s pool\n", Size,
+ UseScratchPool ? "scratch" : "zero-initialized");
+ }
+ }
+ return Mem;
+}
+
+/// Deallocate memory
+int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
+ MemAllocInfoTy Info;
+ if (!AllocInfo.remove(Ptr, &Info)) {
+ DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
+ DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ if (Info.InPool) {
+ size_t DeallocSize = 0;
+ if (Pools.count(Info.Kind) > 0)
+ DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base);
+ if (DeallocSize == 0) {
+ // Try reduction scratch pool
+ DeallocSize = ReductionPool->dealloc(Info.Base);
+ // Try reduction counter pool
+ if (DeallocSize == 0)
+ DeallocSize = CounterPool->dealloc(Info.Base);
+ if (DeallocSize == 0) {
+ DP("Error: Cannot return memory " DPxMOD " to pool\n", DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ }
+ log(0, DeallocSize, Info.Kind, true /* Pool */);
+ return OFFLOAD_SUCCESS;
+ }
+ if (!Info.Base) {
+ DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ CALL_ZE_RET_FAIL(zeMemFree, L0Context->getZeContext(), Info.Base);
+ log(0, Info.Size, Info.Kind);
+
+ DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
+ DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
+ size_t Size) {
+ return Device->enqueueMemCopy(Dst, Src, Size);
+}
+
+void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
+ size_t ActiveSize) {
+ void *Mem = nullptr;
+ ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+ nullptr, 0, 0};
+ ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+ nullptr, 0};
+
+  // Use the relaxed allocation limit if the driver supports it
+ ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{
+ ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr,
+ ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE};
+ if (Size > MaxAllocSize && SupportsLargeMem) {
+ DeviceDesc.pNext = &RelaxedDesc;
+ HostDesc.pNext = &RelaxedDesc;
+ }
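+  // Note: chaining RelaxedDesc via pNext lets a single zeMemAlloc* call
+  // exceed the device's maxMemAllocSize on drivers exposing the relaxed
+  // allocation limits extension; the descriptor must outlive the call.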
+
+  auto zeDevice = Device ? Device->getZeDevice() : nullptr;
+ auto zeContext = L0Context->getZeContext();
+ bool makeResident = false;
+ switch (Kind) {
+ case TARGET_ALLOC_DEVICE:
+ makeResident = true;
+ CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
+ zeDevice, &Mem);
+ DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ case TARGET_ALLOC_HOST:
+ CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
+ DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ case TARGET_ALLOC_SHARED:
+ CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
+ Align, zeDevice, &Mem);
+ DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ default:
+ assert(0 && "Invalid target data allocation kind");
+ }
+
+ size_t LoggedSize = ActiveSize ? ActiveSize : Size;
+ log(LoggedSize, LoggedSize, Kind);
+ if (makeResident) {
+ assert(Device &&
+ "Device is not set for memory allocation. Is this a Device Pool?");
+ if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS)
+ Mem = nullptr;
+ }
+ return Mem;
+}
+
+ze_event_handle_t EventPoolTy::getEvent() {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+
+ if (Events.empty()) {
+ // Need to create a new L0 pool
+ ze_event_pool_desc_t Desc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0, 0};
+ Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
+ Desc.count = PoolSize;
+ ze_event_pool_handle_t Pool;
+ CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool);
+ Pools.push_back(Pool);
+
+ // Create events
+ ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
+ EventDesc.wait = 0;
+ EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+ for (uint32_t I = 0; I < PoolSize; I++) {
+ EventDesc.index = I;
+ ze_event_handle_t Event;
+ CALL_ZE_RET_NULL(zeEventCreate, Pool, &EventDesc, &Event);
+ Events.push_back(Event);
+ }
+ }
+
+ auto Ret = Events.back();
+ Events.pop_back();
+
+ return Ret;
+}
+
+/// Return an event to the pool
+void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ CALL_ZE_RET_VOID(zeEventHostReset, Event);
+ Events.push_back(Event);
+}
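+
+// Usage sketch for the event pool above (hypothetical caller; CmdList is
+// assumed to exist): events are created host-visible, so a caller can do
+//   ze_event_handle_t Ev = Pool.getEvent();
+//   zeCommandListAppendBarrier(CmdList, Ev, 0, nullptr);
+//   zeEventHostSynchronize(Ev, UINT64_MAX);
+//   Pool.releaseEvent(Ev, Device); // resets the event for reuse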
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
new file mode 100644
index 0000000000000..3acb2e78927e7
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -0,0 +1,371 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget.h"
+
+#include "L0Defs.h"
+#include "L0Options.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Check whether the given RootID, SubID, CCSID combination is selected by
+/// ONEAPI_DEVICE_SELECTOR.
+bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
+ int32_t CCSID) const {
+ if (ExplicitRootDevices.empty())
+ return false;
+ for (const auto &RootDev : ExplicitRootDevices) {
+ const auto ErootID = std::get<1>(RootDev);
+ if (ErootID != -2 && RootID != ErootID)
+ continue;
+ const auto EsubID = std::get<2>(RootDev);
+ if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
+ continue;
+ const auto ECCSID = std::get<3>(RootDev);
+ if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
+ continue;
+ // Check if isDiscard
+ if (!std::get<0>(RootDev))
+ return false;
+ return true;
+ }
+ return false;
+}
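+
+// Example with the encoding above (-2 = wildcard, -1 = unspecified):
+// ONEAPI_DEVICE_SELECTOR="level_zero:0.*" parses into {true, 0, -2, -1},
+// so shouldAddDevice(0, 1, -1) returns true while shouldAddDevice(1, -1, -1)
+// returns false.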
+
+/// Read environment variables
+void L0OptionsTy::processEnvironmentVars() {
+ // Compilation options for IGC
+ UserCompilationOptions +=
+ std::string(" ") +
+ StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
+
+ // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
+ const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
+ if (DeviceSelectorVar.isPresent()) {
+    std::string EnvStr(DeviceSelectorVar.get());
+ uint32_t numDiscard = 0;
+ std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
+ [](unsigned char C) { return std::tolower(C); });
+
+ std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+ for (const auto &Term : Entries) {
+ bool isDiscard = false;
+ std::vector<std::string_view> Pair = tokenize(Term, ":", true);
+ if (Pair.empty()) {
+ FAILURE_MESSAGE(
+ "Incomplete selector! Pair and device must be specified.\n");
+ } else if (Pair.size() == 1) {
+ FAILURE_MESSAGE("Incomplete selector! Try '%s:*'if all devices "
+ "under the Pair was original intention.\n",
+ Pair[0].data());
+ } else if (Pair.size() > 2) {
+ FAILURE_MESSAGE(
+ "Error parsing selector string \"%s\" Too many colons (:)\n",
+ Term.data());
+ }
+ if (!((Pair[0][0] == '*') ||
+ (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
+ (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+ break;
+ isDiscard = Pair[0][0] == '!';
+ if (isDiscard)
+ numDiscard++;
+ else if (numDiscard > 0)
+ FAILURE_MESSAGE("All negative(discarding) filters must appear after "
+ "all positive(accepting) filters!");
+
+ std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+ for (const auto &TargetStr : Targets) {
+ bool HasDeviceWildCard = false;
+ bool HasSubDeviceWildCard = false;
+ bool DeviceNum = false;
+ std::vector<std::string_view> DeviceSubTuple =
+ tokenize(TargetStr, ".", true);
+ int32_t RootD[3] = {-1, -1, -1};
+ if (DeviceSubTuple.empty()) {
+ FAILURE_MESSAGE(
+ "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
+ "specified.");
+ }
+
+ std::string_view TopDeviceStr = DeviceSubTuple[0];
+          static const std::array<std::string, 6> DeviceStr = {
+              "host", "cpu", "gpu", "acc", "fpga", "*"};
+          auto It =
+              find_if(DeviceStr.begin(), DeviceStr.end(),
+                      [&](const auto &Dev) { return TopDeviceStr == Dev; });
+ if (It != DeviceStr.end()) {
+ if (TopDeviceStr[0] == '*') {
+ HasDeviceWildCard = true;
+ RootD[0] = -2;
+ } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+ continue;
+ } else {
+ std::string TDS(TopDeviceStr);
+ if (!isDigits(TDS)) {
+ FAILURE_MESSAGE("error parsing device number: %s",
+ DeviceSubTuple[0].data());
+ } else {
+ RootD[0] = std::stoi(TDS);
+ DeviceNum = true;
+ }
+ }
+ if (DeviceSubTuple.size() >= 2) {
+ if (!DeviceNum && !HasDeviceWildCard)
+ FAILURE_MESSAGE("sub-devices can only be requested when parent "
+ "device is specified by number or wildcard, not a "
+ "device type like \'gpu\'");
+ std::string_view SubDeviceStr = DeviceSubTuple[1];
+ if (SubDeviceStr[0] == '*') {
+ HasSubDeviceWildCard = true;
+ RootD[1] = -2;
+ } else {
+ if (HasDeviceWildCard) // subdevice is a number and device is a *
+ FAILURE_MESSAGE(
+ "sub-device can't be requested by number if parent "
+ "device is specified by a wildcard.");
+
+ std::string SDS(SubDeviceStr);
+ if (!isDigits(SDS)) {
+ FAILURE_MESSAGE("error parsing subdevice index: %s",
+ DeviceSubTuple[1].data());
+ } else
+ RootD[1] = std::stoi(SDS);
+ }
+ }
+ if (DeviceSubTuple.size() == 3) {
+ std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+ if (SubSubDeviceStr[0] == '*') {
+ RootD[2] = -2;
+ } else {
+ if (HasSubDeviceWildCard)
+ FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
+ "sub-device before is specified by a wildcard.");
+ std::string SSDS(SubSubDeviceStr);
+ if (!isDigits(SSDS)) {
+ FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
+ DeviceSubTuple[2].data());
+ } else
+ RootD[2] = std::stoi(SSDS);
+ }
+ } else if (DeviceSubTuple.size() > 3) {
+ FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
+ "supported at this time ",
+ TargetStr.data());
+ }
+ if (isDiscard)
+ ExplicitRootDevices.insert(
+ ExplicitRootDevices.begin(),
+ std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+ RootD[1], RootD[2]));
+ else
+ ExplicitRootDevices.push_back(
+ std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+ RootD[1], RootD[2]));
+ }
+ }
+ }
+
+ DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
+ ExplicitRootDevices.size());
+ DP(" (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
+ "-1(ignore)\n");
+ for (auto &T : ExplicitRootDevices) {
+ DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
+ std::get<2>(T), std::get<3>(T));
+ (void)T; // silence warning
+ }
+
+ // Memory pool
+ // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
+ // <Option> := 0 | <PoolInfoList>
+ // <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]
+ // <PoolInfo> := <MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]
+ // <MemType> := all | device | host | shared
+ // <AllocMax> := non-negative integer or empty, max allocation size in
+ // MB (default: 1)
+ // <Capacity> := positive integer or empty, number of allocations from
+ // a single block (default: 4)
+ // <PoolSize> := positive integer or empty, max pool size in MB
+ // (default: 256)
+ const StringEnvar MemoryPoolVar("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL", "");
+ if (MemoryPoolVar.isPresent()) {
+ if (MemoryPoolVar.get() == "0") {
+ Flags.UseMemoryPool = 0;
+ MemPoolInfo.clear();
+ } else {
+ std::istringstream Str(MemoryPoolVar.get());
+ int32_t MemType = -1;
+ int32_t Offset = 0;
+ int32_t Valid = 1;
+ const std::array<int32_t, 3> DefaultValue{1, 4, 256};
+ const int32_t AllMemType = INT32_MAX;
+ std::array<int32_t, 3> AllInfo{1, 4, 256};
+ std::map<int32_t, std::array<int32_t, 3>> PoolInfo;
+ for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
+ if (Token == "device") {
+ MemType = TARGET_ALLOC_DEVICE;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "host") {
+ MemType = TARGET_ALLOC_HOST;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "shared") {
+ MemType = TARGET_ALLOC_SHARED;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "all") {
+ MemType = AllMemType;
+ Offset = 0;
+ Valid = 2;
+ } else if (Offset < 3 && MemType >= 0) {
+ int32_t Num = std::atoi(Token.c_str());
+ bool ValidNum = (Num >= 0 && Offset == 0) || (Num > 0 && Offset > 0);
+ if (ValidNum && MemType == AllMemType)
+ AllInfo[Offset++] = Num;
+ else if (ValidNum)
+ PoolInfo[MemType][Offset++] = Num;
+ else if (Token.size() == 0)
+ Offset++;
+ else
+ Valid = 0;
+ } else {
+ Valid = 0;
+ }
+ }
+ if (Valid > 0) {
+ if (Valid == 2) {
+ // "all" is specified -- ignore other inputs
+ if (AllInfo[0] > 0) {
+ MemPoolInfo[TARGET_ALLOC_DEVICE] = AllInfo;
+ MemPoolInfo[TARGET_ALLOC_HOST] = AllInfo;
+ MemPoolInfo[TARGET_ALLOC_SHARED] = std::move(AllInfo);
+ } else {
+ MemPoolInfo.clear();
+ }
+ } else {
+ // Use user-specified configuration
+ for (auto &I : PoolInfo) {
+ if (I.second[0] > 0)
+ MemPoolInfo[I.first] = I.second;
+ else
+ MemPoolInfo.erase(I.first);
+ }
+ }
+ } else {
+ DP("Ignoring incorrect memory pool configuration "
+ "LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=%s\n",
+ MemoryPoolVar.get().c_str());
+ DP("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>\n");
+ DP(" <Option> := 0 | <PoolInfoList>\n");
+ DP(" <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]\n");
+ DP(" <PoolInfo> := "
+ "<MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]\n");
+ DP(" <MemType> := all | device | host | shared\n");
+ DP(" <AllocMax> := non-negative integer or empty, "
+ "max allocation size in MB (default: 1)\n");
+ DP(" <Capacity> := positive integer or empty, "
+ "number of allocations from a single block (default: 4)\n");
+ DP(" <PoolSize> := positive integer or empty, "
+ "max pool size in MB (default: 256)\n");
+ }
+ }
+ }
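+
+  // For example, LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL="device,8,,512" keeps
+  // the default Capacity (4) but sets AllocMax = 8 MB and PoolSizeMax =
+  // 512 MB for device allocations, while "all,0" clears the pool
+  // configuration for every kind.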
+
+ if (StringEnvar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS").isPresent()) {
+ // To match SYCL RT behavior, we just need to check whether
+ // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value
+ // does not matter.
+ CommonSpecConstants.addConstant<char>(0xFF747469, 1);
+ }
+
+ // LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE=<SizeInKB>
+ const Envar<size_t> StagingBufferSizeVar(
+ "LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE");
+ if (StagingBufferSizeVar.isPresent()) {
+ size_t SizeInKB = StagingBufferSizeVar;
+ if (SizeInKB > (16 << 10)) {
+ SizeInKB = (16 << 10);
+ DP("Staging buffer size is capped at %zu KB\n", SizeInKB);
+ }
+ StagingBufferSize = SizeInKB << 10;
+ }
+
+ // LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE=<Fmt>
+ // <Fmt> := sync | async | async_ordered
+ // sync: perform synchronization after each command
+ // async: perform synchronization when it is required
+ // async_ordered: same as "async", but command is ordered
+ // This option is ignored unless IMM is fully enabled on compute and copy.
+ // On Intel PVC GPU, when used with immediate command lists over Level Zero
+ // backend, a target region may involve multiple command submissions to the
+ // L0 copy queue and compute queue. L0 events are used for each submission
+ // (data transfer of a single item or kernel execution). When "async" is
+ // specified, a) each data transfer to device is submitted with an event.
+ // b) The kernel is submitted next with a dependence on all the previous
+ // data transfer events. The kernel also has an event associated with it.
+ // c) The data transfer from device will be submitted with a dependence on
+ // the kernel event. d) Finally wait on the host for all the events
+ // associated with the data transfer from device.
+  // The env-var affects "target update" constructs as well.
+ // The env-var only affects the L0 copy/compute commands issued from a
+ // single target construct execution, not across multiple invocations.
+ const StringEnvar CommandModeVar("LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE");
+ if (CommandModeVar.isPresent()) {
+ if (match(CommandModeVar, "sync"))
+ CommandMode = CommandModeTy::Sync;
+ else if (match(CommandModeVar, "async"))
+ CommandMode = CommandModeTy::Async;
+ else if (match(CommandModeVar, "async_ordered"))
+ CommandMode = CommandModeTy::AsyncOrdered;
+ else
+ INVALID_OPTION(LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE,
+ CommandModeVar.get().c_str());
+ }
+}
+
+/// Parse the given string and split it into string_view tokens separated by
+/// the Delim character.
+std::vector<std::string_view>
+L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
+ bool ProhibitEmptyTokens) {
+ std::vector<std::string_view> Tokens;
+ size_t Pos = 0;
+ size_t LastPos = 0;
+ while ((Pos = Filter.find(Delim, LastPos)) != std::string::npos) {
+ std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
+
+ if (!Tok.empty()) {
+ Tokens.push_back(Tok);
+ } else if (ProhibitEmptyTokens) {
+ FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
+ "before '%s'delimiter is not allowed.",
+ Delim.c_str());
+ }
+ // move the search starting index
+ LastPos = Pos + 1;
+ }
+
+ // Add remainder if any
+ if (LastPos < Filter.size()) {
+ std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
+ Tokens.push_back(Tok);
+ } else if ((LastPos != 0) && ProhibitEmptyTokens) {
+    // If the delimiter is the last symbol in the string.
+ FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
+ "'%s' delimiter is not allowed.",
+ Delim.c_str());
+ }
+ return Tokens;
+}
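+
+// For instance, tokenize("level_zero:0.1", ":", true) yields
+// {"level_zero", "0.1"}, while tokenize(";gpu", ";", true) reports the empty
+// token in front of the leading delimiter.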
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
new file mode 100644
index 0000000000000..51d6595560484
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -0,0 +1,285 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/zes_api.h>
+
+#include "L0Device.h"
+#include "L0Interop.h"
+#include "L0Kernel.h"
+#include "L0Plugin.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+using namespace llvm::omp::target;
+using namespace error;
+
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+// Common data across all possible plugin instantiations
+L0OptionsTy LevelZeroPluginTy::Options;
+
+int32_t LevelZeroPluginTy::findDevices() {
+ CALL_ZE_RET_ZERO(zeInit, ZE_INIT_FLAG_GPU_ONLY);
+ uint32_t NumDrivers = 0;
+ CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, nullptr);
+ if (NumDrivers == 0) {
+ DP("Cannot find any drivers.\n");
+ return 0;
+ }
+  const bool ExplicitMode = !getOptions().ExplicitRootDevices.empty();
+
+ // We expect multiple drivers on Windows to support different device types,
+ // so we need to maintain multiple drivers and contexts in general.
+ llvm::SmallVector<ze_driver_handle_t> FoundDrivers(NumDrivers);
+ CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, FoundDrivers.data());
+
+ struct RootInfoTy {
+ uint32_t OrderId;
+ ze_device_handle_t zeDevice;
+ L0ContextTy *Driver;
+ bool IsDiscrete;
+ };
+ llvm::SmallVector<RootInfoTy> RootDevices;
+
+ uint32_t OrderId = 0;
+ for (uint32_t DriverId = 0; DriverId < NumDrivers; DriverId++) {
+ const auto &Driver = FoundDrivers[DriverId];
+ uint32_t DeviceCount = 0;
+ ze_result_t RC;
+ CALL_ZE(RC, zeDeviceGet, Driver, &DeviceCount, nullptr);
+ if (RC != ZE_RESULT_SUCCESS || DeviceCount == 0) {
+ DP("Cannot find any devices from driver " DPxMOD ".\n", DPxPTR(Driver));
+ continue;
+ }
+ // We have a driver that supports at least one device
+ ContextList.emplace_back(*this, Driver, DriverId);
+ auto &DrvInfo = ContextList.back();
+ llvm::SmallVector<ze_device_handle_t> FoundDevices(DeviceCount);
+ CALL_ZE_RET_ZERO(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
+
+ for (auto &zeDevice : FoundDevices)
+ RootDevices.push_back(
+ {OrderId++, zeDevice, &DrvInfo, L0DeviceTy::isDiscrete(zeDevice)});
+ }
+
+  // Move discrete devices to the front; within each group, keep the
+  // discovery order.
+  std::sort(RootDevices.begin(), RootDevices.end(),
+            [](const RootInfoTy &A, const RootInfoTy &B) {
+              if (A.IsDiscrete == B.IsDiscrete)
+                return A.OrderId < B.OrderId;
+              return A.IsDiscrete;
+            });
+
+ struct DeviceInfoTy {
+ L0DeviceIdTy Id;
+ L0ContextTy *Driver;
+ bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
+ };
+
+ llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
+
+ // helper lambdas
+ auto addDevice = [ExplicitMode,
+ &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+ int32_t SubId = -1, int32_t CCSId = -1) {
+ if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
+ DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
+ }
+ };
+ for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
+ const auto zeDevice = RootDevices[RootId].zeDevice;
+ auto *RootDriver = RootDevices[RootId].Driver;
+ addDevice(zeDevice, RootDriver, RootId);
+ }
+ NumDevices = DevicesToAdd.size();
+ auto DeviceId = 0;
+ for (auto &DeviceInfo : DevicesToAdd) {
+ auto RootId = DeviceInfo.Id.RootId;
+ auto SubId = DeviceInfo.Id.SubId;
+ auto CCSId = DeviceInfo.Id.CCSId;
+ auto zeDevice = DeviceInfo.Id.zeId;
+ auto *Driver = DeviceInfo.Driver;
+
+ std::string IdStr = std::to_string(RootId) +
+ (SubId < 0 ? "" : "." + std::to_string(SubId)) +
+ (CCSId < 0 ? "" : "." + std::to_string(CCSId));
+
+ L0Devices.push_back(new L0DeviceTy(*this, DeviceId, getNumRootDevices(),
+ zeDevice, *Driver, std::move(IdStr),
+ CCSId < 0 ? 0 : CCSId /* ComputeIndex */
+ ));
+ DeviceId++;
+ }
+
+ DP("Found %" PRIu32 " root devices, %" PRIu32 " total devices.\n",
+ getNumRootDevices(), NumDevices);
+ DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
+ for (auto &l0Device : L0Devices) {
+ DP("-- %s\n", l0Device->getZeIdCStr());
+ (void)l0Device; // silence warning
+ }
+
+ if (getDebugLevel() > 0) {
+ DP("Root Device Information\n");
+ for (uint32_t I = 0; I < getNumRootDevices(); I++) {
+ auto &l0Device = getDeviceFromId(I);
+ l0Device.reportDeviceInfo();
+ }
+ }
+
+ return getNumRootDevices();
+}
+
+/// Clean-up routine to be invoked by the destructor or
+/// LevelZeroPluginTy::deinit.
+void LevelZeroPluginTy::closeRTL() {
+  ContextTLSTable.clear();
+ DeviceTLSTable.clear();
+ ThreadTLSTable.clear();
+ ContextList.clear();
+
+ DP("Plugin closed successfully\n");
+}
+
+Expected<int32_t> LevelZeroPluginTy::initImpl() {
+ DP("Level0 NG plugin initialization\n");
+ // process options before anything else
+ Options.init();
+ return findDevices();
+}
+
+Error LevelZeroPluginTy::deinitImpl() {
+ DP("Deinit Level0 plugin!\n");
+ closeRTL();
+ return Plugin::success();
+}
+
+GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
+ int32_t DeviceId,
+ int32_t NumDevices) {
+ return &getDeviceFromId(DeviceId);
+}
+
+GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
+ return new L0GlobalHandlerTy();
+}
+
+uint16_t LevelZeroPluginTy::getMagicElfBits() const {
+ // TODO: We need to register a real ELF machine type
+ return 0x8086;
+}
+
+Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
+ return Triple::spirv64;
+}
+
+const char *LevelZeroPluginTy::getName() const { return GETNAME(TARGET_NAME); }
+
+Error LevelZeroPluginTy::flushQueueImpl(omp_interop_val_t *Interop) {
+ return Plugin::success();
+}
+
+Expected<bool> LevelZeroPluginTy::isELFCompatible(uint32_t DeviceId,
+ StringRef Image) const {
+ uint64_t MajorVer, MinorVer;
+ return isValidOneOmpImage(Image, MajorVer, MinorVer);
+}
+
+Error LevelZeroPluginTy::syncBarrierImpl(omp_interop_val_t *Interop) {
+ if (!Interop) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ if (!Interop->async_info || !Interop->async_info->Queue)
+ return Plugin::success();
+
+ // L0 object
+ const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ const auto device_id = Interop->device_id;
+ auto &l0Device = getDeviceFromId(device_id);
+
+ // We can synchronize both L0 & SYCL objects with the same ze command
+ if (l0Device.useImmForInterop()) {
+ DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+ " with ImmCmdList barrier\n",
+ DPxPTR(Interop));
+ auto ImmCmdList = L0->ImmCmdList;
+ auto Event = l0Device.getEvent();
+
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, Event, 0,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
+ l0Device.releaseEvent(Event);
+ } else {
+ DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+ " with queue synchronize\n",
+ DPxPTR(Interop));
+ auto CmdQueue = L0->CommandQueue;
+ CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ }
+
+ return Plugin::success();
+}
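+
+// Note on the two paths above: with an immediate command list the barrier is
+// tracked by an event, so a single zeEventHostSynchronize covers it; with a
+// regular queue, zeCommandQueueSynchronize drains all previously submitted
+// work.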
+
+Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
+ if (!Interop) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ if (!Interop->async_info || !Interop->async_info->Queue)
+ return Plugin::success();
+
+ const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ const auto device_id = Interop->device_id;
+ if (Interop->attrs.inorder)
+ return Plugin::success();
+
+ auto &l0Device = getDeviceFromId(device_id);
+ if (l0Device.useImmForInterop()) {
+ DP("LevelZeroPluginTy::async_barrier: Appending ImmCmdList barrier "
+ "to " DPxMOD "\n",
+ DPxPTR(Interop));
+ auto ImmCmdList = L0->ImmCmdList;
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, nullptr, 0,
+ nullptr);
+ } else {
+ DP("LevelZeroPluginTy::async_barrier: Appending CmdList barrier to " DPxMOD
+ "\n",
+ DPxPTR(Interop));
+ auto CmdQueue = L0->CommandQueue;
+ ze_command_list_handle_t CmdList = l0Device.getCmdList();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+ CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+ }
+
+ return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
+
+extern "C" {
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_level_zero() {
+ return new llvm::omp::target::plugin::LevelZeroPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
new file mode 100644
index 0000000000000..33c19b0e7c50d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -0,0 +1,625 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <dlfcn.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // !_WIN32
+
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ GlobalTy &DeviceGlobal) {
+ const char *GlobalName = DeviceGlobal.getName().data();
+
+ L0DeviceTy &l0Device = static_cast<L0DeviceTy &>(Device);
+ const L0ProgramTy *Program =
+ l0Device.getProgramFromImage(Image.getTgtImage());
+ void *Addr = Program->getOffloadVarDeviceAddr(GlobalName);
+
+ // Save the pointer to the symbol allowing nullptr.
+ DeviceGlobal.setPtr(Addr);
+
+ if (Addr == nullptr)
+ return Plugin::error(ErrorCode::UNKNOWN, "Failed to load global '%s'",
+ GlobalName);
+
+ return Plugin::success();
+}
+
+inline L0DeviceTy &L0ProgramTy::getL0Device() const {
+ return L0DeviceTy::makeL0Device(getDevice());
+}
+
+L0ProgramTy::~L0ProgramTy() {
+ for (auto *Kernel : Kernels) {
+ // We need explicit destructor and deallocate calls to release the kernels
+ // created by `GenericDeviceTy::constructKernel()`.
+ Kernel->~L0KernelTy();
+ getL0Device().getPlugin().free(Kernel);
+ }
+ for (auto Module : Modules) {
+ CALL_ZE_RET_VOID(zeModuleDestroy, Module);
+ }
+}
+
+void L0ProgramTy::setLibModule() {
+#ifdef _WIN32
+ return;
+#else
+ const auto *Image = getTgtImage();
+ const size_t NumEntries =
+ static_cast<size_t>(Image->EntriesEnd - Image->EntriesBegin);
+ for (size_t I = 0; I < NumEntries; I++) {
+ const auto &Entry = Image->EntriesBegin[I];
+ // Image contains a kernel, so it is not compiled as a library module
+ if (Entry.SymbolName && Entry.Size == 0)
+ return;
+ }
+ // Check if the image belongs to a dynamic library
+ Dl_info DLI{nullptr};
+ if (dladdr(Image->ImageStart, &DLI) && DLI.dli_fname) {
+ std::vector<uint8_t> FileBin;
+ auto Size = readFile(DLI.dli_fname, FileBin);
+ if (Size) {
+ auto MB = MemoryBuffer::getMemBuffer(
+ StringRef(reinterpret_cast<const char *>(FileBin.data()), Size),
+ /*BufferName=*/"", /*RequiresNullTerminator=*/false);
+ auto ELF = ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ if (ELF) {
+ if (auto *Obj = dyn_cast<ELF64LEObjectFile>((*ELF).get())) {
+ const auto Header = Obj->getELFFile().getHeader();
+ if (Header.e_type == ELF::ET_DYN) {
+ DP("Processing current image as library\n");
+ IsLibModule = true;
+ }
+ }
+ }
+ }
+ }
+#endif // _WIN32
+}
+
+int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
+ const std::string &CommonBuildOptions,
+ ze_module_format_t Format) {
+ const ze_module_constants_t SpecConstants =
+ LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
+ auto &l0Device = getL0Device();
+ std::string BuildOptions(CommonBuildOptions);
+
+ // Add required flag to enable dynamic linking.
+ if (IsLibModule)
+ BuildOptions += " -library-compilation ";
+
+ ze_module_desc_t ModuleDesc{};
+ ModuleDesc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
+ ModuleDesc.pNext = nullptr;
+ ModuleDesc.format = Format;
+ ze_module_handle_t Module = nullptr;
+ ze_module_build_log_handle_t BuildLog = nullptr;
+ ze_result_t RC;
+
+ // Build a single module from a single image
+ ModuleDesc.inputSize = Size;
+ ModuleDesc.pInputModule = Image;
+ ModuleDesc.pBuildFlags = BuildOptions.c_str();
+ ModuleDesc.pConstants = &SpecConstants;
+ CALL_ZE_RC(RC, zeModuleCreate, l0Device.getZeContext(),
+ l0Device.getZeDevice(), &ModuleDesc, &Module, &BuildLog);
+
+ const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
+
+ if (BuildFailed) {
+ if (IsLibModule)
+ return OFFLOAD_SUCCESS;
+ return OFFLOAD_FAIL;
+ } else {
+ // Check if module link is required. We do not need this check for
+ // library module
+ if (!RequiresModuleLink && !IsLibModule) {
+ ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+ nullptr, 0};
+ CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+ RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+ }
+    // For now, assume the first module contains the libraries and globals.
+ if (Modules.empty())
+ GlobalModule = Module;
+ Modules.push_back(Module);
+ l0Device.addGlobalModule(Module);
+ return OFFLOAD_SUCCESS;
+ }
+}
+
+int32_t L0ProgramTy::linkModules() {
+ auto &l0Device = getL0Device();
+ if (!RequiresModuleLink) {
+ DP("Module link is not required\n");
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (Modules.empty()) {
+ DP("Invalid number of modules when linking modules\n");
+ return OFFLOAD_FAIL;
+ }
+
+ ze_result_t RC;
+ ze_module_build_log_handle_t LinkLog = nullptr;
+ CALL_ZE_RC(RC, zeModuleDynamicLink,
+ static_cast<uint32_t>(l0Device.getNumGlobalModules()),
+ l0Device.getGlobalModulesArray(), &LinkLog);
+ const bool LinkFailed = (RC != ZE_RESULT_SUCCESS);
+ return LinkFailed ? OFFLOAD_FAIL : OFFLOAD_SUCCESS;
+}
+
+size_t L0ProgramTy::readFile(const char *FileName,
+ std::vector<uint8_t> &OutFile) const {
+ std::ifstream IFS(FileName, std::ios::binary);
+ if (!IFS.good())
+ return 0;
+ IFS.seekg(0, IFS.end);
+ auto FileSize = static_cast<size_t>(IFS.tellg());
+ OutFile.resize(FileSize);
+ IFS.seekg(0);
+ if (!IFS.read(reinterpret_cast<char *>(OutFile.data()), FileSize)) {
+ OutFile.clear();
+ return 0;
+ }
+ return FileSize;
+}
+
+/// Read SPV from file name
+int32_t L0ProgramTy::readSPVFile(const char *FileName,
+ std::vector<uint8_t> &OutSPV) const {
+ // Resolve full path using the location of the plugin
+ std::string FullPath;
+#ifdef _WIN32
+ char RTLPath[_MAX_PATH];
+ HMODULE RTLModule = nullptr;
+ if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+ GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+ (LPCSTR)&__tgt_target_data_begin_nowait,
+ &RTLModule)) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ FullPath = RTLPath;
+#else // _WIN32
+ Dl_info RTLInfo;
+ if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ FullPath = RTLInfo.dli_fname;
+#endif // _WIN32
+ const size_t PathSep = FullPath.find_last_of("/\\");
+ FullPath.replace(PathSep + 1, std::string::npos, FileName);
+ // Read from the full path
+ if (!readFile(FullPath.c_str(), OutSPV)) {
+ DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+ std::string &Options) const {
+ // Options that need to be replaced with backend-specific options
+ static const struct {
+ std::string Option;
+ std::string BackendOption;
+ } OptionTranslationTable[] = {
+ {"-ftarget-compile-fast",
+ "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"},
+ {"-foffload-fp32-prec-div", "-ze-fp32-correctly-rounded-divide-sqrt"},
+ {"-foffload-fp32-prec-sqrt", "-ze-fp32-correctly-rounded-divide-sqrt"},
+ };
+
+ for (const auto &OptPair : OptionTranslationTable) {
+ const size_t Pos = Options.find(OptPair.Option);
+ if (Pos != std::string::npos) {
+ Options.replace(Pos, OptPair.Option.length(), OptPair.BackendOption);
+ }
+ }
+}
+
+// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h:
+#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3
+
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+ uint64_t &MinorVer) {
+ const auto MB = MemoryBuffer::getMemBuffer(Image,
+ /*BufferName=*/"",
+ /*RequiresNullTerminator=*/false);
+ auto ExpectedNewE =
+ ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ if (!ExpectedNewE) {
+ DP("Warning: unable to get ELF handle!\n");
+ return false;
+ }
+ bool Res = false;
+ auto processObjF = [&](const auto ELFObjF) {
+ if (!ELFObjF) {
+ DP("Warning: Unexpected ELF type!\n");
+ return false;
+ }
+ const auto &ELFF = ELFObjF->getELFFile();
+ auto Sections = ELFF.sections();
+ if (!Sections) {
+ DP("Warning: unable to get ELF sections!\n");
+ return false;
+ }
+ bool SeenOffloadSection = false;
+ for (auto Sec : *Sections) {
+ if (Sec.sh_type != ELF::SHT_NOTE)
+ continue;
+ Error Err = Plugin::success();
+ for (auto Note : ELFF.notes(Sec, Err)) {
+ if (Err) {
+ DP("Warning: unable to get ELF notes handle!\n");
+ return false;
+ }
+ if (Note.getName() != "INTELONEOMPOFFLOAD")
+ continue;
+ SeenOffloadSection = true;
+ if (Note.getType() != NT_INTEL_ONEOMP_OFFLOAD_VERSION)
+ continue;
+
+          std::string DescStr(Note.getDescAsStringRef(4).str());
+ const auto DelimPos = DescStr.find('.');
+ if (DelimPos == std::string::npos) {
+ // The version has to look like "Major#.Minor#".
+ DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n",
+ DescStr.c_str());
+ return false;
+ }
+ const std::string MajorVerStr = DescStr.substr(0, DelimPos);
+ DescStr.erase(0, DelimPos + 1);
+ MajorVer = std::stoull(MajorVerStr);
+ MinorVer = std::stoull(DescStr);
+ return (MajorVer == 1 && MinorVer == 0);
+ }
+ }
+ return SeenOffloadSection;
+ };
+ if (const auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+ Res = processObjF(O);
+ } else if (const auto *O =
+ dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+ Res = processObjF(O);
+ } else {
+ assert(false && "Unexpected ELF format");
+ }
+ return Res;
+}
+
+static StringRef getImageStringRef(const __tgt_device_image *Image) {
+ const char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
+ const char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
+ const size_t ImgSize = ImgEnd - ImgBegin;
+ return StringRef(ImgBegin, ImgSize);
+}
+
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+ uint64_t &MinorVer) {
+ return isValidOneOmpImage(getImageStringRef(Image), MajorVer, MinorVer);
+}
+
+int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
+ auto &l0Device = getL0Device();
+ auto *Image = getTgtImage();
+ if (identify_magic(getImageStringRef(Image)) == file_magic::spirv_object) {
+ // Handle legacy plain SPIR-V image.
+ uint8_t *ImgBegin = reinterpret_cast<uint8_t *>(Image->ImageStart);
+ uint8_t *ImgEnd = reinterpret_cast<uint8_t *>(Image->ImageEnd);
+ size_t ImgSize = ImgEnd - ImgBegin;
+ return addModule(ImgSize, ImgBegin, BuildOptions,
+ ZE_MODULE_FORMAT_IL_SPIRV);
+ }
+
+ uint64_t MajorVer, MinorVer;
+ if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+ DP("Warning: image is not a valid oneAPI OpenMP image.\n");
+ return OFFLOAD_FAIL;
+ }
+
+ setLibModule();
+
+ // Iterate over the images and pick the first one that fits.
+ uint64_t ImageCount = 0;
+ struct V1ImageInfo {
+ // 0 - native, 1 - SPIR-V
+ uint64_t Format = (std::numeric_limits<uint64_t>::max)();
+ std::string CompileOpts;
+ std::string LinkOpts;
+ // We may have multiple sections created from split-kernel mode
+ std::vector<const uint8_t *> PartBegin;
+ std::vector<uint64_t> PartSize;
+
+ V1ImageInfo(uint64_t Format, std::string CompileOpts, std::string LinkOpts)
+ : Format(Format), CompileOpts(std::move(CompileOpts)),
+ LinkOpts(std::move(LinkOpts)) {}
+ };
+ std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
+
+ auto MB = MemoryBuffer::getMemBuffer(getImageStringRef(Image),
+ /*BufferName=*/"",
+ /*RequiresNullTerminator=*/false);
+ auto ExpectedNewE =
+ ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ assert(ExpectedNewE &&
+ "isValidOneOmpImage() returns true for invalid ELF image");
+ auto processELF = [&](auto *EObj) {
+ assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
+ assert(MajorVer == 1 && MinorVer == 0 &&
+ "FIXME: update image processing for new oneAPI OpenMP version.");
+ const auto &E = EObj->getELFFile();
+ // Collect auxiliary information.
+ uint64_t MaxImageIdx = 0;
+
+ auto Sections = E.sections();
+ assert(Sections && "isValidOneOmpImage() returns true for ELF image with "
+ "invalid sections.");
+
+ for (auto Sec : *Sections) {
+ if (Sec.sh_type != ELF::SHT_NOTE)
+ continue;
+ Error Err = Plugin::success();
+ for (auto Note : E.notes(Sec, Err)) {
+ assert(!Err && "isValidOneOmpImage() returns true for ELF image with "
+ "invalid notes.");
+ if (Note.getName().str() != "INTELONEOMPOFFLOAD")
+ continue;
+
+ const uint64_t Type = Note.getType();
+          std::string DescStr(Note.getDescAsStringRef(4).str());
+ switch (Type) {
+ default:
+ DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
+ ImageCount = std::stoull(DescStr);
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
+ std::vector<std::string> Parts;
+ do {
+ const auto DelimPos = DescStr.find('\0');
+ if (DelimPos == std::string::npos) {
+ Parts.push_back(std::move(DescStr));
+ break;
+ }
+ Parts.push_back(DescStr.substr(0, DelimPos));
+ DescStr.erase(0, DelimPos + 1);
+ } while (Parts.size() < 4);
+
+ // Ignore records with less than 4 strings.
+ if (Parts.size() != 4) {
+ DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX "
+ "record is ignored.\n");
+ continue;
+ }
+
+ const uint64_t Idx = std::stoull(Parts[0]);
+ MaxImageIdx = (std::max)(MaxImageIdx, Idx);
+ if (AuxInfo.find(Idx) != AuxInfo.end()) {
+ DP("Warning: duplicate auxiliary information for image %" PRIu64
+ " is ignored.\n",
+ Idx);
+ continue;
+ }
+ AuxInfo.emplace(
+ std::piecewise_construct, std::forward_as_tuple(Idx),
+ std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+ // Image pointer and size
+ // will be initialized later.
+ }
+ }
+ }
+ }
+
+ if (MaxImageIdx >= ImageCount)
+ DP("Warning: invalid image index found in auxiliary information.\n");
+
+ for (auto Sec : *Sections) {
+ const char *Prefix = "__openmp_offload_spirv_";
+ auto ExpectedSectionName = E.getSectionName(Sec);
+ assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
+ "image with invalid section names");
+ std::string SectionName = (*ExpectedSectionName).str();
+ if (SectionName.find(Prefix) != 0)
+ continue;
+ SectionName.erase(0, std::strlen(Prefix));
+
+ // Expected section name in split-kernel mode:
+ // __openmp_offload_spirv_<image_id>_<part_id>
+ auto PartIdLoc = SectionName.find("_");
+ if (PartIdLoc != std::string::npos) {
+ DP("Found a split section in the image\n");
+        // It seems that we do not need the part ID as long as the parts are
+        // ordered in the image and we keep that ordering in the runtime.
+ SectionName.erase(PartIdLoc);
+ } else {
+ DP("Found a single section in the image\n");
+ }
+
+ uint64_t Idx = std::stoull(SectionName);
+ if (Idx >= ImageCount) {
+ DP("Warning: ignoring image section (index %" PRIu64
+ " is out of range).\n",
+ Idx);
+ continue;
+ }
+
+ auto AuxInfoIt = AuxInfo.find(Idx);
+ if (AuxInfoIt == AuxInfo.end()) {
+ DP("Warning: ignoring image section (no aux info).\n");
+ continue;
+ }
+ auto Contents = E.getSectionContents(Sec);
+ assert(Contents);
+ AuxInfoIt->second.PartBegin.push_back((*Contents).data());
+ AuxInfoIt->second.PartSize.push_back(Sec.sh_size);
+ }
+ };
+
+ if (auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+ processELF(O);
+ } else if (auto *O = dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+ processELF(O);
+ } else {
+ assert(false && "Unexpected ELF format");
+ }
+
+ for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) {
+ const auto It = AuxInfo.find(Idx);
+ if (It == AuxInfo.end()) {
+ DP("Warning: image %" PRIu64
+ " without auxiliary information is ingored.\n",
+ Idx);
+ continue;
+ }
+
+ const auto NumParts = It->second.PartBegin.size();
+ // Split-kernel is not supported in SPIRV format
+ if (NumParts > 1 && It->second.Format != 0) {
+ DP("Warning: split-kernel images are not supported in SPIRV format\n");
+ continue;
+ }
+
+ // Skip unknown image format
+ if (It->second.Format != 0 && It->second.Format != 1) {
+ DP("Warning: image %" PRIu64 "is ignored due to unknown format.\n", Idx);
+ continue;
+ }
+
+ const bool IsBinary = (It->second.Format == 0);
+ const auto ModuleFormat =
+ IsBinary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV;
+    std::string Options = BuildOptions;
+    Options += " " + It->second.CompileOpts + " " + It->second.LinkOpts;
+    replaceDriverOptsWithBackendOpts(l0Device, Options);
+
+ for (size_t I = 0; I < NumParts; I++) {
+ const unsigned char *ImgBegin =
+ reinterpret_cast<const unsigned char *>(It->second.PartBegin[I]);
+ size_t ImgSize = It->second.PartSize[I];
+
+ auto RC = addModule(ImgSize, ImgBegin, Options, ModuleFormat);
+
+ if (RC != OFFLOAD_SUCCESS) {
+ DP("Error: failed to create program from %s "
+ "(%" PRIu64 "-%zu).\n",
+ IsBinary ? "Binary" : "SPIR-V", Idx, I);
+ return OFFLOAD_FAIL;
+ }
+ }
+
+ DP("Created module from image #%" PRIu64 ".\n", Idx);
+ BuildOptions = std::move(Options);
+
+ return OFFLOAD_SUCCESS;
+ }
+
+ return OFFLOAD_FAIL;
+}
+
+void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
+ DP("Looking up OpenMP global variable '%s'.\n", CName);
+
+ if (!GlobalModule || !CName)
+ return nullptr;
+
+ std::string Name(CName);
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ for (auto Module : Modules) {
+ CALL_ZE(RC, zeModuleGetGlobalPointer, Module, Name.c_str(), &SizeDummy,
+ &DevicePtr);
+ if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+ return DevicePtr;
+ }
+ DP("Warning: global variable '%s' was not found in the device.\n",
+ Name.c_str());
+ return nullptr;
+}
+
+int32_t L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
+ void *HostPtr) {
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+ &DevicePtr);
+ if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+ DP("Warning: cannot read from device global variable %s\n", Name);
+ return OFFLOAD_FAIL;
+ }
+ return getL0Device().enqueueMemCopy(HostPtr, DevicePtr, Size);
+}
+
+int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
+ const void *HostPtr) {
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+ &DevicePtr);
+ if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+ DP("Warning: cannot write to device global variable %s\n", Name);
+ return OFFLOAD_FAIL;
+ }
+ return getL0Device().enqueueMemCopy(DevicePtr, HostPtr, Size);
+}
+
+int32_t L0ProgramTy::loadModuleKernels() {
+  // We need to collect kernel names here, before filling the offload entries,
+  // since we do not know which module contains a kernel with a given name.
+ for (auto Module : Modules) {
+ uint32_t Count = 0;
+ CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);
+ if (Count == 0)
+ continue;
+
+ llvm::SmallVector<const char *> Names(Count);
+ CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, Names.data());
+
+ for (auto *Name : Names) {
+ KernelsToModuleMap.emplace(Name, Module);
+ }
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
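The two global-variable helpers above reduce to a single Level Zero call. For
reference, a minimal standalone sketch of the same lookup pattern; the helper
name and the simplified error handling are illustrative, not part of the patch:

  #include <level_zero/ze_api.h>
  #include <cstdio>

  // Resolve a device global by name in a built module. The caller then copies
  // bytes to or from the returned device pointer, as enqueueMemCopy does above.
  void *lookupDeviceGlobal(ze_module_handle_t Module, const char *Name) {
    size_t SizeDummy = 0; // the API can also report the global's size
    void *DevicePtr = nullptr;
    ze_result_t RC =
        zeModuleGetGlobalPointer(Module, Name, &SizeDummy, &DevicePtr);
    if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
      std::fprintf(stderr, "global '%s' not found in module\n", Name);
      return nullptr;
    }
    return DevicePtr;
  }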
diff --git a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
new file mode 100644
index 0000000000000..3721d686393bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
@@ -0,0 +1,71 @@
+//===--- level_zero/src/OmpWrapper.cpp -------------------------*- C++ -*-===//
+//
+// Implements wrappers that resolve OpenMP entry points through dlopen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "L0Defs.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(omp_get_max_teams, 0)
+DLWRAP_INTERNAL(omp_get_teams_thread_limit, 0)
+
+DLWRAP_FINALIZE()
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+static bool loadOpenMP() {
+ static bool Loaded{false};
+ if (Loaded)
+ return true;
+
+ const char *OpenMPLibrary = "libomp.so";
+ std::string ErrMsg;
+
+ DP("Trying to load %s\n", OpenMPLibrary);
+ auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+ llvm::sys::DynamicLibrary::getPermanentLibrary(OpenMPLibrary, &ErrMsg));
+ if (!DynlibHandle->isValid()) {
+ if (ErrMsg.empty())
+ ErrMsg = "unknown error";
+ DP("Unable to load library '%s': %s!\n", OpenMPLibrary, ErrMsg.c_str());
+ return false;
+ }
+
+ for (size_t I = 0; I < dlwrap::size(); I++) {
+ const char *Sym = dlwrap::symbol(I);
+
+ void *P = DynlibHandle->getAddressOfSymbol(Sym);
+ if (P == nullptr) {
+ DP("Unable to find '%s' in '%s'!\n", Sym, OpenMPLibrary);
+ return false;
+ }
+ DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+
+ *dlwrap::pointer(I) = P;
+ }
+  Loaded = true;
+  return true;
+}
+
+int omp_get_max_teams() {
+ if (!loadOpenMP())
+ return 0;
+ return dlwrap_omp_get_max_teams();
+}
+
+int omp_get_teams_thread_limit() {
+ if (!loadOpenMP())
+ return 0;
+ return dlwrap_omp_get_teams_thread_limit();
+}
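The file above lazily binds two OpenMP entry points through the common DLWrap
machinery. A standalone sketch of the same lazy dlopen/dlsym pattern, assuming
only POSIX; the names are illustrative and the fallback mirrors the return-0
behavior of the wrappers above:

  #include <dlfcn.h>
  #include <cstdio>

  // Resolve omp_get_max_teams on first use and cache the function pointer.
  int callMaxTeams() {
    static int (*Fn)() = []() -> int (*)() {
      void *H = dlopen("libomp.so", RTLD_NOW | RTLD_GLOBAL);
      return H ? reinterpret_cast<int (*)()>(dlsym(H, "omp_get_max_teams"))
               : nullptr;
    }();
    return Fn ? Fn() : 0; // fall back to 0 when libomp is unavailable
  }

  int main() { std::printf("max teams: %d\n", callMaxTeams()); }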
>From f8956cd31b8abdabe3714108489fd2795ffd6013 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:02:58 +0200
Subject: [PATCH 02/13] Update offload/CMakeLists.txt
Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
offload/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 8a704ab05eb53..3432ca3c29059 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -180,7 +180,7 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
message(STATUS "Not building Level Zero plugin: it is only supported on "
- "Linux/Windows x86_64, ppc64le, or aarch64 hosts")
+ "Linux/Windows x86_64 or ppc64le hosts")
list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
endif()
endif()
>From d3fc4d70f7e62cf5e3993e3c431c1430a8ff2d22 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:03:12 +0200
Subject: [PATCH 03/13] Update
offload/plugins-nextgen/level_zero/CMakeLists.txt
Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
offload/plugins-nextgen/level_zero/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index b9c8dd423c3ca..8e465d663655c 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -1,5 +1,5 @@
if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-return()
+ return()
endif()
# Create the library and add the default arguments.
>From 4b383862881d6201e44885da90f30c48312ab8dd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:09:41 +0200
Subject: [PATCH 04/13] Update
offload/plugins-nextgen/level_zero/include/L0Plugin.h
Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
offload/plugins-nextgen/level_zero/include/L0Plugin.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 4658c1cdab1df..de78ded59c2ce 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -45,7 +45,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
/// L0 plugin global options
static L0OptionsTy Options;
- /// Global mutex
std::mutex GlobalMutex;
/// Common pool of AsyncQueue
>From 6c1c820a923a3f017f4bc9cf054d9c6b39bb6f77 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 10:47:46 +0200
Subject: [PATCH 05/13] Replace pragma once
---
offload/plugins-nextgen/level_zero/include/AsyncQueue.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Context.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Defs.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Device.h | 4 +++-
offload/plugins-nextgen/level_zero/include/L0Interop.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Kernel.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Memory.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Options.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Plugin.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Program.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/L0Trace.h | 5 ++++-
offload/plugins-nextgen/level_zero/include/TLS.h | 5 ++++-
12 files changed, 47 insertions(+), 12 deletions(-)
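The change is mechanical: each header's pragma once becomes a classic include
guard named after its path, matching LLVM convention. Sketched on a
hypothetical header Foo.h:

  #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_FOO_H
  #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_FOO_H

  // ... declarations ...

  #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_FOO_H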
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 105f68205e402..e26661a613772 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
#include <vector>
@@ -48,3 +49,5 @@ typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
} // namespace target
} // namespace omp
} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index b2b6def8101ca..69748a3e61d01 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
#include "L0Memory.h"
#include "PerThreadTable.h"
@@ -136,3 +137,5 @@ class L0ContextTy {
};
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 81566f52a2aea..05c287f4da013 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -9,7 +9,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
#include "PluginInterface.h"
#include "Shared/Requirements.h"
@@ -71,3 +72,5 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
__func__);
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 6acfa7e0ee67d..e22cfd928c0af 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
#include "llvm/ADT/SmallVector.h"
@@ -678,3 +679,4 @@ class L0DeviceTy final : public GenericDeviceTy {
} // namespace target
} // namespace omp
} // namespace llvm
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
index 4b8b417f9b339..69a1a5f274068 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Interop.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
namespace llvm::omp::target::plugin::L0Interop {
@@ -23,3 +24,5 @@ struct Property {
};
} // namespace llvm::omp::target::plugin::L0Interop
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index bc6fc54cdea08..eca416d6fa882 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
#include "L0Defs.h"
#include "L0Trace.h"
@@ -152,3 +153,5 @@ class L0KernelTy : public GenericKernelTy {
};
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 50af80a19a93a..f5547201c994f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
#include <cassert>
#include <level_zero/ze_api.h>
@@ -572,3 +573,5 @@ class StagingBufferTy {
};
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b3ecd25f56ddd..a501df693f311 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
#include <level_zero/ze_api.h>
@@ -187,3 +188,5 @@ struct L0OptionsTy {
}; // L0OptionsTy
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index de78ded59c2ce..9fbdafa288592 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
#include "AsyncQueue.h"
#include "L0Defs.h"
@@ -133,3 +134,5 @@ class LevelZeroPluginTy final : public GenericPluginTy {
};
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index a548b486f4642..d156cce268182 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
#include "L0Kernel.h"
@@ -133,3 +134,5 @@ bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
uint64_t &MinorVer);
} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index 2eeae81016dee..f8519bd44ae79 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
// clang-format off
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
#include "Shared/Debug.h"
#include "omptarget.h"
@@ -191,3 +192,5 @@ inline const char *getZeErrorName(int32_t Error) {
return "ZE_RESULT_ERROR_UNKNOWN";
}
}
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
index 8a5f41312e129..46086ee4b6d19 100644
--- a/offload/plugins-nextgen/level_zero/include/TLS.h
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -10,7 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
#include "AsyncQueue.h"
#include "L0Memory.h"
@@ -84,3 +85,5 @@ struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
} // namespace target
} // namespace omp
} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
>From fd91c47605a4ad06d6a5780d69e530995f4e2035 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:20:34 +0200
Subject: [PATCH 06/13] Address review comments
---
.../level_zero/include/AsyncQueue.h | 2 ++
.../level_zero/include/L0Defs.h | 3 +-
.../level_zero/include/L0Memory.h | 2 +-
.../level_zero/include/L0Options.h | 8 ++---
.../level_zero/src/L0Device.cpp | 6 ++--
.../level_zero/src/L0Kernel.cpp | 10 +++++-
.../level_zero/src/L0Options.cpp | 2 +-
.../level_zero/src/L0Program.cpp | 32 +++++++++----------
8 files changed, 37 insertions(+), 28 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index e26661a613772..2d32f1767a7b6 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -13,6 +13,8 @@
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
+#include <list>
+#include <tuple>
#include <vector>
#include "L0Memory.h"
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 05c287f4da013..66d38cd7b9eb5 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -1,4 +1,5 @@
//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -43,7 +44,7 @@ LIBOMP_DECL(double, omp_get_wtime(void));
namespace llvm::omp::target::plugin {
/// Default alignmnet for allocation
-constexpr size_t L0Alignment = 0;
+constexpr size_t L0DefaultAlignment = 0;
/// Default staging buffer size for host to device copy (16KB)
constexpr size_t L0StagingBufferSize = (1 << 14);
/// Default staging buffer count
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index f5547201c994f..63115b1a3c529 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -506,7 +506,7 @@ class StagingBufferTy {
void *Ret = nullptr;
size_t AllocSize = Size * Count;
CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
- L0Alignment, &Ret);
+ L0DefaultAlignment, &Ret);
Buffers.push_back(Ret);
return Ret;
}
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index a501df693f311..7e64f71054569 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -141,9 +141,9 @@ struct L0OptionsTy {
bool Init = false; // have the options already been processed
- /// Read environment variables
L0OptionsTy() {}
+ /// Read environment variables
void processEnvironmentVars();
void init() {
@@ -155,9 +155,9 @@ struct L0OptionsTy {
/// Parse the string and split it into tokens of string_views based on the
/// Delim character.
- std::vector<std::string_view> tokenize(const std::string_view &Filter,
- const std::string &Delim,
- bool ProhibitEmptyTokens = false);
+ static std::vector<std::string_view>
+ tokenize(const std::string_view &Filter, const std::string &Delim,
+ bool ProhibitEmptyTokens = false);
bool isDigits(const std::string_view &str) {
if (str.size() == 0)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 0029d00a07685..2235741ea70a4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -64,15 +64,15 @@ constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0
DeviceArchTy L0DeviceTy::computeArch() const {
const auto PCIDeviceId = getPCIId();
if (PCIDeviceId != 0) {
- for (int arch = 0; arch < DeviceArchMapSize; arch++) {
+ for (int ArchIndex = 0; ArchIndex < DeviceArchMapSize; ArchIndex++) {
for (int i = 0;; i++) {
- const auto Id = DeviceArchMap[arch].ids[i];
+ const auto Id = DeviceArchMap[ArchIndex].ids[i];
if (Id == PCIIdTy::None)
break;
auto maskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
if (maskedId == Id)
- return DeviceArchMap[arch].arch; // Exact match or prefix match
+ return DeviceArchMap[ArchIndex].arch; // Exact match or prefix match
}
}
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index d1cb0b7bd50bd..b0a13a07ab919 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,7 +163,9 @@ void L0KernelTy::decideKernelGroupArguments(
uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
bool UsedReductionSubscriptionRate = false;
if (!MaxGroupCountForced) {
- { GRPCounts[0] *= OptSubscRate; }
+ {
+ GRPCounts[0] *= OptSubscRate;
+ }
size_t LoopTripcount = 0;
if (LoopLevels) {
@@ -626,6 +628,12 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
}
} else {
ze_event_handle_t Event = nullptr;
+ if (AllowCooperative)
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &GroupCounts, Event, 0, nullptr);
+ else
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &GroupCounts, Event, 0, nullptr);
KernelLock.unlock();
CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 3acb2e78927e7..cb3a23b3e8bd4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -99,7 +99,7 @@ void L0OptionsTy::processEnvironmentVars() {
std::string_view TopDeviceStr = DeviceSubTuple[0];
static const std::array<std::string, 7> DeviceStr = {
- "host", "cpu", "gpu", "acc", "fpga", "*"};
+ "host", "cpu", "gpu", "acc", "*"};
auto It =
find_if(DeviceStr.begin(), DeviceStr.end(),
[&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 33c19b0e7c50d..9828f379e681a 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -132,22 +132,22 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
if (IsLibModule)
return OFFLOAD_SUCCESS;
return OFFLOAD_FAIL;
- } else {
- // Check if module link is required. We do not need this check for
- // library module
- if (!RequiresModuleLink && !IsLibModule) {
- ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
- nullptr, 0};
- CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
- RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
- }
- // For now, assume the first module contains libraries, globals.
- if (Modules.empty())
- GlobalModule = Module;
- Modules.push_back(Module);
- l0Device.addGlobalModule(Module);
- return OFFLOAD_SUCCESS;
}
+
+ // Check if module link is required. We do not need this check for
+ // library module
+ if (!RequiresModuleLink && !IsLibModule) {
+ ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+ nullptr, 0};
+ CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+ RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+ }
+ // For now, assume the first module contains libraries, globals.
+ if (Modules.empty())
+ GlobalModule = Module;
+ Modules.push_back(Module);
+ l0Device.addGlobalModule(Module);
+ return OFFLOAD_SUCCESS;
}
int32_t L0ProgramTy::linkModules() {
@@ -376,8 +376,6 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
"isValidOneOmpImage() returns true for invalid ELF image");
auto processELF = [&](auto *EObj) {
assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
- assert(MajorVer == 1 && MinorVer == 0 &&
- "FIXME: update image processing for new oneAPI OpenMP version.");
const auto &E = EObj->getELFFile();
// Collect auxiliary information.
uint64_t MaxImageIdx = 0;
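The L0Kernel.cpp hunk above adds the launch calls that were missing before the
command list is closed and executed. For context, a sketch of the full Level
Zero submission sequence, with handles assumed valid and error checks elided:

  #include <level_zero/ze_api.h>
  #include <cstdint>

  void submitKernel(ze_command_list_handle_t CmdList, ze_kernel_handle_t Kernel,
                    ze_command_queue_handle_t Queue,
                    ze_group_count_t GroupCounts) {
    // Record the launch, seal the list, hand it to the queue, then wait.
    zeCommandListAppendLaunchKernel(CmdList, Kernel, &GroupCounts,
                                    /*hSignalEvent=*/nullptr, 0, nullptr);
    zeCommandListClose(CmdList);
    zeCommandQueueExecuteCommandLists(Queue, 1, &CmdList, /*hFence=*/nullptr);
    zeCommandQueueSynchronize(Queue, UINT64_MAX);
  }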
>From 84665dc22f710e4401e65dea6d20e18128c94daa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:44:35 +0200
Subject: [PATCH 07/13] Fix CMake formatting
---
.../plugins-nextgen/level_zero/CMakeLists.txt | 92 ++++++++++---------
1 file changed, 48 insertions(+), 44 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 8e465d663655c..df38671c040ab 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -6,64 +6,68 @@ endif()
add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
set(LEVEL_ZERO_SRC_FILES
- src/L0Context.cpp
- src/L0Device.cpp
- src/L0Kernel.cpp
- src/L0Memory.cpp
- src/L0Program.cpp
- src/L0Plugin.cpp
- src/L0Program.cpp
- src/L0Options.cpp
+ src/L0Context.cpp
+ src/L0Device.cpp
+ src/L0Kernel.cpp
+ src/L0Memory.cpp
+ src/L0Program.cpp
+ src/L0Plugin.cpp
+ src/L0Program.cpp
+ src/L0Options.cpp
)
list(APPEND LEVEL_ZERO_SRC_FILES
- src/OmpWrapper.cpp
+ src/OmpWrapper.cpp
)
target_sources(omptarget.rtl.level_zero PRIVATE
- ${LEVEL_ZERO_SRC_FILES}
+ ${LEVEL_ZERO_SRC_FILES}
)
target_include_directories(omptarget.rtl.level_zero PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/include
- ${CMAKE_CURRENT_SOURCE_DIR}/src
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_include_directories(omptarget.rtl.level_zero PRIVATE
- ${LIBOMPTARGET_INCLUDE_DIR}
- ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
- ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
- ${LIBOMPTARGET_OMP_HEADER_DIR}
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_OMP_HEADER_DIR}
)
if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
-message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+ message(STATUS "Building Level Zero NG plugin linked against level_zero library")
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- target_link_libraries(omptarget.rtl.level_zero PRIVATE
- ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
-elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
- # Full path to the L0 library is recognized as a linker option, so we
- # separate directory and file name
- get_filename_component(LEVEL_ZERO_LIBRARY_PATH
- ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
- get_filename_component(LEVEL_ZERO_LIBRARY_NAME
- ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
- target_link_libraries(omptarget.rtl.level_zero PRIVATE
- ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
- target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
- target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
- libomptarget_add_resource_file(omptarget.rtl.level_zero)
+ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+ elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Full path to the L0 library is recognized as a linker option, so we
+ # separate directory and file name
+ get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+ get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+ target_link_directories(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_LIBRARY_PATH})
+ target_link_options(omptarget.rtl.level_zero PRIVATE
+ "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+ libomptarget_add_resource_file(omptarget.rtl.level_zero)
+ else()
+ message(FATAL_ERROR "Missing platform support")
+ endif()
else()
- message(FATAL_ERROR "Missing platfrom support")
-endif()
-
-else()
-message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
-get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-if(CMAKE_SYSTEM_NAME MATCHES "Windows")
- # Windows uses dll instead of lib files at runtime
- string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
-endif()
-target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
-target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+ message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+ get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+ if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Windows uses dll instead of lib files at runtime
+ string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME
+ ${LEVEL_ZERO_LIBRARY_NAME})
+ endif()
+ target_compile_options(omptarget.rtl.level_zero PRIVATE
+ "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+ target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
endif()
>From a2217dbd426065c7b0f831a66947738a636b0c74 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 14:56:40 +0200
Subject: [PATCH 08/13] change to StringRef in multiple places
---
.../level_zero/include/AsyncQueue.h | 2 +-
.../level_zero/include/L0Options.h | 29 ++-------
.../level_zero/src/L0Device.cpp | 7 ++-
.../level_zero/src/L0Program.cpp | 59 ++++++++++---------
4 files changed, 43 insertions(+), 54 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 2d32f1767a7b6..dfa8c54b1c124 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// Async Queue wrapper for SPIR-V/Xe machine
+// Async Queue wrapper for Level Zero
//
//===----------------------------------------------------------------------===//
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 7e64f71054569..ba62aa9ac0afa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -63,7 +63,6 @@ class SpecConstantsTy {
return Tmp;
}
};
-#define FIXED static constexpr
/// L0 Plugin flags
struct L0OptionFlagsTy {
@@ -94,7 +93,7 @@ struct L0OptionsTy {
std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
/// Oversubscription rate for normal kernels
- FIXED uint32_t SubscriptionRate = 4;
+ uint32_t SubscriptionRate = 4;
/// Loop kernels with known ND-range may be known to have
/// few iterations and they may not exploit the offload device
@@ -112,7 +111,7 @@ struct L0OptionsTy {
/// in the kernel should decrease.
/// Anyway, this is just a heuristics that seems to work well for some
/// kernels (which poorly expose parallelism in the first place).
- FIXED double ThinThreadsThreshold = 0.1;
+ double ThinThreadsThreshold = 0.1;
/// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
/// All the discard filter should be before the accept filter.
@@ -127,8 +126,8 @@ struct L0OptionsTy {
// option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
// builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
// builtins.
- std::string CompilationOptions = "-cl-std=CL2.0 ";
- std::string InternalCompilationOptions = "-cl-take-global-address";
+ static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
+ static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
std::string UserCompilationOptions = "";
// Spec constants used for all modules.
@@ -165,24 +164,8 @@ struct L0OptionsTy {
return std::all_of(str.begin(), str.end(), ::isdigit);
}
- bool match(const std::string &Var, const std::string &Matched) {
- if (Var.size() != Matched.size())
- return false;
-
- auto equals = [](char a, char b) {
- return std::tolower(a) == std::tolower(b);
- };
- return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
- equals);
- }
-
- bool match(const std::string &Var, const char *Matched) {
- std::string Str(Matched);
- return match(Var, Str);
- }
-
- bool match(const StringEnvar &Var, const char *Matched) {
- return match(Var.get(), Matched);
+ bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
+ return Matched.equals_insensitive(Var.get());
}
}; // L0OptionsTy
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 2235741ea70a4..1ef66751655d6 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -413,13 +413,14 @@ L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
(void)NumEntries; // silence warning
const auto &Options = getPlugin().getOptions();
- std::string CompilationOptions(Options.CompilationOptions + " " +
- Options.UserCompilationOptions);
+ std::string CompilationOptions(Options.CompilationOptions);
+ CompilationOptions += " " + Options.UserCompilationOptions;
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
"Base L0 module compilation options: %s\n", CompilationOptions.c_str());
- CompilationOptions += " " + Options.InternalCompilationOptions;
+ CompilationOptions += " ";
+ CompilationOptions += Options.InternalCompilationOptions;
auto &Program = addProgram(ImageId, TgtImage);
int32_t RC = Program.buildModules(CompilationOptions);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 9828f379e681a..e7448757b9141 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -395,7 +395,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
continue;
const uint64_t Type = Note.getType();
- std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+ auto DescStrRef = Note.getDescAsStringRef(4);
switch (Type) {
default:
DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
@@ -403,19 +403,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
break;
case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
- ImageCount = std::stoull(DescStr);
+      if (DescStrRef.getAsInteger(10, ImageCount)) {
+ DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
+ DescStrRef.str().c_str());
+ ImageCount = 0;
+ }
break;
- case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
- std::vector<std::string> Parts;
- do {
- const auto DelimPos = DescStr.find('\0');
- if (DelimPos == std::string::npos) {
- Parts.push_back(std::move(DescStr));
- break;
- }
- Parts.push_back(DescStr.substr(0, DelimPos));
- DescStr.erase(0, DelimPos + 1);
- } while (Parts.size() < 4);
+ case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX:
+ llvm::SmallVector<llvm::StringRef, 4> Parts;
+ DescStrRef.split(Parts, '\0', /* MaxSplit = */ 4,
+ /* KeepEmpty = */ false);
// Ignore records with less than 4 strings.
if (Parts.size() != 4) {
@@ -424,7 +421,8 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
continue;
}
- const uint64_t Idx = std::stoull(Parts[0]);
+ uint64_t Idx = 0;
+ Parts[0].getAsInteger(10, Idx);
MaxImageIdx = (std::max)(MaxImageIdx, Idx);
if (AuxInfo.find(Idx) != AuxInfo.end()) {
DP("Warning: duplicate auxiliary information for image %" PRIu64
@@ -432,13 +430,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
Idx);
continue;
}
+
+      uint64_t Part1Id = 0;
+ Parts[1].getAsInteger(10, Part1Id);
+
AuxInfo.emplace(
std::piecewise_construct, std::forward_as_tuple(Idx),
- std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+ std::forward_as_tuple(Part1Id, Parts[2].str(), Parts[3].str()));
// Image pointer and size
// will be initialized later.
}
- }
}
}
@@ -450,24 +451,28 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
auto ExpectedSectionName = E.getSectionName(Sec);
assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
"image with invalid section names");
- std::string SectionName = (*ExpectedSectionName).str();
- if (SectionName.find(Prefix) != 0)
+ auto &SectionNameRef = *ExpectedSectionName;
+ if (!SectionNameRef.consume_front(Prefix))
continue;
- SectionName.erase(0, std::strlen(Prefix));
// Expected section name in split-kernel mode:
// __openmp_offload_spirv_<image_id>_<part_id>
- auto PartIdLoc = SectionName.find("_");
- if (PartIdLoc != std::string::npos) {
- DP("Found a split section in the image\n");
- // It seems that we do not need part ID as long as they are ordered
- // in the image and we keep the ordering in the runtime.
- SectionName.erase(PartIdLoc);
- } else {
+ auto Parts = SectionNameRef.split('_');
+ // It seems that we do not need part ID as long as they are ordered
+ // in the image and we keep the ordering in the runtime.
+ SectionNameRef = Parts.first;
+ if (Parts.second.empty()) {
DP("Found a single section in the image\n");
+ } else {
+ DP("Found a split section in the image\n");
}
- uint64_t Idx = std::stoull(SectionName);
+ uint64_t Idx = 0;
+    if (SectionNameRef.getAsInteger(10, Idx)) {
+ DP("Warning: ignoring image section (invalid index '%s').\n",
+ SectionNameRef.str().c_str());
+ continue;
+ }
if (Idx >= ImageCount) {
DP("Warning: ignoring image section (index %" PRIu64
" is out of range).\n",
>From 08880a623e89a5eb7f9deebce9ced19e6d2b9e1a Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 09:49:34 +0200
Subject: [PATCH 09/13] remove tokenize
---
.../level_zero/include/L0Options.h | 12 --
.../level_zero/src/L0Options.cpp | 106 ++++++------------
2 files changed, 37 insertions(+), 81 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index ba62aa9ac0afa..b08a07f52fcc0 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -152,18 +152,6 @@ struct L0OptionsTy {
}
}
- /// Parse the string and split it into tokens of string_views based on the
- /// Delim character.
- static std::vector<std::string_view>
- tokenize(const std::string_view &Filter, const std::string &Delim,
- bool ProhibitEmptyTokens = false);
-
- bool isDigits(const std::string_view &str) {
- if (str.size() == 0)
- return false;
- return std::all_of(str.begin(), str.end(), ::isdigit);
- }
-
bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
return Matched.equals_insensitive(Var.get());
}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index cb3a23b3e8bd4..d0871c715b180 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -53,43 +53,51 @@ void L0OptionsTy::processEnvironmentVars() {
if (DeviceSelectorVar.isPresent()) {
std::string EnvStr(std::move(DeviceSelectorVar.get()));
uint32_t numDiscard = 0;
- std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
- [](unsigned char C) { return std::tolower(C); });
+ std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
- std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+ llvm::StringRef EnvRef(EnvStr);
+ llvm::SmallVector<llvm::StringRef> Entries;
+    EnvRef.split(Entries, ';', /* MaxSplit = */ -1,
+ /* KeepEmpty = */ false);
for (const auto &Term : Entries) {
bool isDiscard = false;
- std::vector<std::string_view> Pair = tokenize(Term, ":", true);
- if (Pair.empty()) {
+
+ auto Parts = Term.split(':');
+ if (Parts.first.empty()) {
FAILURE_MESSAGE(
"Incomplete selector! Pair and device must be specified.\n");
- } else if (Pair.size() == 1) {
- FAILURE_MESSAGE("Incomplete selector! Try '%s:*'if all devices "
- "under the Pair was original intention.\n",
- Pair[0].data());
- } else if (Pair.size() > 2) {
+ }
+ if (Parts.second.empty()) {
+ FAILURE_MESSAGE(
+ "Incomplete selector! Pair and device must be specified.\n");
+ }
+ if (Parts.second.contains(':')) {
FAILURE_MESSAGE(
"Error parsing selector string \"%s\" Too many colons (:)\n",
Term.data());
}
- if (!((Pair[0][0] == '*') ||
- (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
- (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+
+ if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
+ Parts.first == "!level_zero"))
break;
- isDiscard = Pair[0][0] == '!';
+ isDiscard = Parts.first[0] == '!';
+
if (isDiscard)
numDiscard++;
else if (numDiscard > 0)
FAILURE_MESSAGE("All negative(discarding) filters must appear after "
"all positive(accepting) filters!");
- std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+ llvm::SmallVector<llvm::StringRef> Targets;
+      Parts.second.split(Targets, ',', /* MaxSplit = */ -1,
+ /* KeepEmpty = */ false);
for (const auto &TargetStr : Targets) {
bool HasDeviceWildCard = false;
bool HasSubDeviceWildCard = false;
bool DeviceNum = false;
- std::vector<std::string_view> DeviceSubTuple =
- tokenize(TargetStr, ".", true);
+ llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
+        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ -1,
+ /* KeepEmpty = */ false);
int32_t RootD[3] = {-1, -1, -1};
if (DeviceSubTuple.empty()) {
FAILURE_MESSAGE(
@@ -97,7 +105,7 @@ void L0OptionsTy::processEnvironmentVars() {
"specified.");
}
- std::string_view TopDeviceStr = DeviceSubTuple[0];
+ auto TopDeviceStr = DeviceSubTuple[0];
static const std::array<std::string, 7> DeviceStr = {
"host", "cpu", "gpu", "acc", "*"};
auto It =
@@ -107,15 +115,13 @@ void L0OptionsTy::processEnvironmentVars() {
if (TopDeviceStr[0] == '*') {
HasDeviceWildCard = true;
RootD[0] = -2;
- } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+ } else if (TopDeviceStr == "gpu")
continue;
} else {
- std::string TDS(TopDeviceStr);
- if (!isDigits(TDS)) {
+ if (TopDeviceStr.getAsInteger(10, RootD[0])) {
FAILURE_MESSAGE("error parsing device number: %s",
- DeviceSubTuple[0].data());
+ DeviceSubTuple[0].str().c_str());
} else {
- RootD[0] = std::stoi(TDS);
DeviceNum = true;
}
}
@@ -124,7 +130,7 @@ void L0OptionsTy::processEnvironmentVars() {
FAILURE_MESSAGE("sub-devices can only be requested when parent "
"device is specified by number or wildcard, not a "
"device type like \'gpu\'");
- std::string_view SubDeviceStr = DeviceSubTuple[1];
+ auto SubDeviceStr = DeviceSubTuple[1];
if (SubDeviceStr[0] == '*') {
HasSubDeviceWildCard = true;
RootD[1] = -2;
@@ -134,28 +140,24 @@ void L0OptionsTy::processEnvironmentVars() {
"sub-device can't be requested by number if parent "
"device is specified by a wildcard.");
- std::string SDS(SubDeviceStr);
- if (!isDigits(SDS)) {
+          if (SubDeviceStr.getAsInteger(10, RootD[1])) {
FAILURE_MESSAGE("error parsing subdevice index: %s",
- DeviceSubTuple[1].data());
- } else
- RootD[1] = std::stoi(SDS);
+ DeviceSubTuple[1].str().c_str());
+ }
}
}
if (DeviceSubTuple.size() == 3) {
- std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+ auto SubSubDeviceStr = DeviceSubTuple[2];
if (SubSubDeviceStr[0] == '*') {
RootD[2] = -2;
} else {
if (HasSubDeviceWildCard)
FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
"sub-device before is specified by a wildcard.");
- std::string SSDS(SubSubDeviceStr);
- if (!isDigits(SSDS)) {
+          if (SubSubDeviceStr.getAsInteger(10, RootD[2])) {
FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
- DeviceSubTuple[2].data());
- } else
- RootD[2] = std::stoi(SSDS);
+ DeviceSubTuple[2].str().c_str());
+ }
}
} else if (DeviceSubTuple.size() > 3) {
FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
@@ -333,39 +335,5 @@ void L0OptionsTy::processEnvironmentVars() {
CommandModeVar.get().c_str());
}
}
-/// Parse String and split into tokens of string_views based on the
-/// Delim character.
-std::vector<std::string_view>
-L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
- bool ProhibitEmptyTokens) {
- std::vector<std::string_view> Tokens;
- size_t Pos = 0;
- size_t LastPos = 0;
- while ((Pos = Filter.find(Delim, LastPos)) != std::string::npos) {
- std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
-
- if (!Tok.empty()) {
- Tokens.push_back(Tok);
- } else if (ProhibitEmptyTokens) {
- FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
- "before '%s'delimiter is not allowed.",
- Delim.c_str());
- }
- // move the search starting index
- LastPos = Pos + 1;
- }
-
- // Add remainder if any
- if (LastPos < Filter.size()) {
- std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
- Tokens.push_back(Tok);
- } else if ((LastPos != 0) && ProhibitEmptyTokens) {
- // if delimiter is the last sybmol in the string.
- FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
- "'%s' delimiter is not allowed.",
- Delim.c_str());
- }
- return Tokens;
-}
} // namespace llvm::omp::target::plugin
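Likewise for StringRef::split: MaxSplit bounds how many splits are performed,
so 0 performs none and -1 (the default) splits without limit, which is why the
calls above pass -1. A small illustration:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include <cassert>

  int main() {
    llvm::SmallVector<llvm::StringRef> Out;
    llvm::StringRef("a;b;c").split(Out, ';', /*MaxSplit=*/-1,
                                   /*KeepEmpty=*/false);
    assert(Out.size() == 3); // {"a", "b", "c"}
    Out.clear();
    llvm::StringRef("a;b;c").split(Out, ';', /*MaxSplit=*/0,
                                   /*KeepEmpty=*/false);
    assert(Out.size() == 1 && Out[0] == "a;b;c"); // no splits performed
  }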
>From 9a3088c49bc52ce643c63223608802eefc04b924 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 12:47:02 +0200
Subject: [PATCH 10/13] remove unused code
---
.../level_zero/include/L0Program.h | 1 -
.../level_zero/include/L0Trace.h | 7 ----
.../level_zero/src/L0Program.cpp | 38 -------------------
3 files changed, 46 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index d156cce268182..ca8b3b8a5cf52 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -60,7 +60,6 @@ class L0ProgramTy : public DeviceImageTy {
const std::string &BuildOption, ze_module_format_t Format);
/// Read file and return the size of the binary if successful.
size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
- int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
std::string &Options) const;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index f8519bd44ae79..0faa76171cbc9 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -27,13 +27,6 @@
DP(__VA_ARGS__); \
} while (0)
-#define FATAL_ERROR(Msg) \
- do { \
- fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
- fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg); \
- exit(EXIT_FAILURE); \
- } while (0)
-
#define WARNING(...) \
do { \
fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index e7448757b9141..eb5da943d56c9 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -187,44 +187,6 @@ size_t L0ProgramTy::readFile(const char *FileName,
return FileSize;
}
-/// Read SPV from file name
-int32_t L0ProgramTy::readSPVFile(const char *FileName,
- std::vector<uint8_t> &OutSPV) const {
- // Resolve full path using the location of the plugin
- std::string FullPath;
-#ifdef _WIN32
- char RTLPath[_MAX_PATH];
- HMODULE RTLModule = nullptr;
- if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
- GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
- (LPCSTR)&__tgt_target_data_begin_nowait,
- &RTLModule)) {
- DP("Error: module creation failed -- cannot resolve full path\n");
- return OFFLOAD_FAIL;
- }
- if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
- DP("Error: module creation failed -- cannot resolve full path\n");
- return OFFLOAD_FAIL;
- }
- FullPath = RTLPath;
-#else // _WIN32
- Dl_info RTLInfo;
- if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
- DP("Error: module creation failed -- cannot resolve full path\n");
- return OFFLOAD_FAIL;
- }
- FullPath = RTLInfo.dli_fname;
-#endif // _WIN32
- const size_t PathSep = FullPath.find_last_of("/\\");
- FullPath.replace(PathSep + 1, std::string::npos, FileName);
- // Read from the full path
- if (!readFile(FullPath.c_str(), OutSPV)) {
- DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
- return OFFLOAD_FAIL;
- }
- return OFFLOAD_SUCCESS;
-}
-
void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
std::string &Options) const {
// Options that need to be replaced with backend-specific options
>From 24d06455603aaf372ce20a69b776ba237f1edaaa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 13:04:12 +0200
Subject: [PATCH 11/13] fix format
---
offload/plugins-nextgen/level_zero/include/L0Options.h | 3 ++-
offload/plugins-nextgen/level_zero/src/L0Kernel.cpp | 4 ++--
offload/plugins-nextgen/level_zero/src/L0Options.cpp | 8 ++++----
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b08a07f52fcc0..e383f070f10aa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -127,7 +127,8 @@ struct L0OptionsTy {
// builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
// builtins.
static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
- static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
+ static constexpr std::string_view InternalCompilationOptions =
+ "-cl-take-global-address";
std::string UserCompilationOptions = "";
// Spec constants used for all modules.
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b0a13a07ab919..538e627405b6d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,8 +163,8 @@ void L0KernelTy::decideKernelGroupArguments(
uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
bool UsedReductionSubscriptionRate = false;
if (!MaxGroupCountForced) {
- {
- GRPCounts[0] *= OptSubscRate;
+ {
+ GRPCounts[0] *= OptSubscRate;
}
size_t LoopTripcount = 0;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index d0871c715b180..1e0baa3f2b089 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -78,7 +78,7 @@ void L0OptionsTy::processEnvironmentVars() {
}
if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
- Parts.first == "!level_zero"))
+ Parts.first == "!level_zero"))
break;
isDiscard = Parts.first[0] == '!';
@@ -97,7 +97,7 @@ void L0OptionsTy::processEnvironmentVars() {
bool DeviceNum = false;
llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
         TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ -1,
- /* KeepEmpty = */ false);
+ /* KeepEmpty = */ false);
int32_t RootD[3] = {-1, -1, -1};
if (DeviceSubTuple.empty()) {
FAILURE_MESSAGE(
@@ -106,8 +106,8 @@ void L0OptionsTy::processEnvironmentVars() {
}
auto TopDeviceStr = DeviceSubTuple[0];
- static const std::array<std::string, 7> DeviceStr = {
- "host", "cpu", "gpu", "acc", "*"};
+    static const std::array<std::string, 5> DeviceStr = {"host", "cpu",
+ "gpu", "acc", "*"};
auto It =
find_if(DeviceStr.begin(), DeviceStr.end(),
[&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
>From 0eb57125a5caff61ac4de16256965f4fad0ad120 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 16:31:25 +0200
Subject: [PATCH 12/13] Remove environment variable
---
.../level_zero/include/L0Options.h | 7 -
.../level_zero/src/L0Options.cpp | 162 ------------------
.../level_zero/src/L0Plugin.cpp | 10 +-
.../level_zero/src/L0Program.cpp | 5 +-
4 files changed, 4 insertions(+), 180 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index e383f070f10aa..8c79a82ef724b 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -113,13 +113,6 @@ struct L0OptionsTy {
/// kernels (which poorly expose parallelism in the first place).
double ThinThreadsThreshold = 0.1;
- /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
- /// All the discard filter should be before the accept filter.
- std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
-
- /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
- bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
-
// Compilation options for IGC
// OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
// runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 1e0baa3f2b089..7229e2498ae13 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -18,29 +18,6 @@
namespace llvm::omp::target::plugin {
-/// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
-bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
- int32_t CCSID) const {
- if (ExplicitRootDevices.empty())
- return false;
- for (const auto &RootDev : ExplicitRootDevices) {
- const auto ErootID = std::get<1>(RootDev);
- if (ErootID != -2 && RootID != ErootID)
- continue;
- const auto EsubID = std::get<2>(RootDev);
- if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
- continue;
- const auto ECCSID = std::get<3>(RootDev);
- if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
- continue;
- // Check if isDiscard
- if (!std::get<0>(RootDev))
- return false;
- return true;
- }
- return false;
-}
-
/// Read environment variables
void L0OptionsTy::processEnvironmentVars() {
// Compilation options for IGC
@@ -48,145 +25,6 @@ void L0OptionsTy::processEnvironmentVars() {
std::string(" ") +
StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
- // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
- const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
- if (DeviceSelectorVar.isPresent()) {
- std::string EnvStr(std::move(DeviceSelectorVar.get()));
- uint32_t numDiscard = 0;
- std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
-
- llvm::StringRef EnvRef(EnvStr);
- llvm::SmallVector<llvm::StringRef> Entries;
-    EnvRef.split(Entries, ';', /* MaxSplit = */ -1,
- /* KeepEmpty = */ false);
- for (const auto &Term : Entries) {
- bool isDiscard = false;
-
- auto Parts = Term.split(':');
- if (Parts.first.empty()) {
- FAILURE_MESSAGE(
- "Incomplete selector! Pair and device must be specified.\n");
- }
- if (Parts.second.empty()) {
- FAILURE_MESSAGE(
- "Incomplete selector! Pair and device must be specified.\n");
- }
- if (Parts.second.contains(':')) {
- FAILURE_MESSAGE(
- "Error parsing selector string \"%s\" Too many colons (:)\n",
- Term.data());
- }
-
- if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
- Parts.first == "!level_zero"))
- break;
- isDiscard = Parts.first[0] == '!';
-
- if (isDiscard)
- numDiscard++;
- else if (numDiscard > 0)
- FAILURE_MESSAGE("All negative(discarding) filters must appear after "
- "all positive(accepting) filters!");
-
- llvm::SmallVector<llvm::StringRef> Targets;
-      Parts.second.split(Targets, ',', /* MaxSplit = */ -1,
- /* KeepEmpty = */ false);
- for (const auto &TargetStr : Targets) {
- bool HasDeviceWildCard = false;
- bool HasSubDeviceWildCard = false;
- bool DeviceNum = false;
- llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
-        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ -1,
- /* KeepEmpty = */ false);
- int32_t RootD[3] = {-1, -1, -1};
- if (DeviceSubTuple.empty()) {
- FAILURE_MESSAGE(
- "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
- "specified.");
- }
-
- auto TopDeviceStr = DeviceSubTuple[0];
-          static const std::array<std::string, 5> DeviceStr = {"host", "cpu",
- "gpu", "acc", "*"};
- auto It =
- find_if(DeviceStr.begin(), DeviceStr.end(),
- [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
- if (It != DeviceStr.end()) {
- if (TopDeviceStr[0] == '*') {
- HasDeviceWildCard = true;
- RootD[0] = -2;
- } else if (TopDeviceStr == "gpu")
- continue;
- } else {
- if (TopDeviceStr.getAsInteger(10, RootD[0])) {
- FAILURE_MESSAGE("error parsing device number: %s",
- DeviceSubTuple[0].str().c_str());
- } else {
- DeviceNum = true;
- }
- }
- if (DeviceSubTuple.size() >= 2) {
- if (!DeviceNum && !HasDeviceWildCard)
- FAILURE_MESSAGE("sub-devices can only be requested when parent "
- "device is specified by number or wildcard, not a "
- "device type like \'gpu\'");
- auto SubDeviceStr = DeviceSubTuple[1];
- if (SubDeviceStr[0] == '*') {
- HasSubDeviceWildCard = true;
- RootD[1] = -2;
- } else {
- if (HasDeviceWildCard) // subdevice is a number and device is a *
- FAILURE_MESSAGE(
- "sub-device can't be requested by number if parent "
- "device is specified by a wildcard.");
-
- if (!SubDeviceStr.getAsInteger(10, RootD[1])) {
- FAILURE_MESSAGE("error parsing subdevice index: %s",
- DeviceSubTuple[1].str().c_str());
- }
- }
- }
- if (DeviceSubTuple.size() == 3) {
- auto SubSubDeviceStr = DeviceSubTuple[2];
- if (SubSubDeviceStr[0] == '*') {
- RootD[2] = -2;
- } else {
- if (HasSubDeviceWildCard)
- FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
- "sub-device before is specified by a wildcard.");
- if (!SubSubDeviceStr.getAsInteger(10, RootD[2])) {
- FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
- DeviceSubTuple[2].str().c_str());
- }
- }
- } else if (DeviceSubTuple.size() > 3) {
- FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
- "supported at this time ",
- TargetStr.data());
- }
- if (isDiscard)
- ExplicitRootDevices.insert(
- ExplicitRootDevices.begin(),
- std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
- RootD[1], RootD[2]));
- else
- ExplicitRootDevices.push_back(
- std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
- RootD[1], RootD[2]));
- }
- }
- }
-
- DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
- ExplicitRootDevices.size());
- DP(" (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
- "-1(ignore)\n");
- for (auto &T : ExplicitRootDevices) {
- DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
- std::get<2>(T), std::get<3>(T));
- (void)T; // silence warning
- }
-
// Memory pool
// LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
// <Option> := 0 | <PoolInfoList>
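
For context, each term the removed parser accepted was encoded as an (Accept, RootID, SubID, CCSID) tuple, with -2 meaning "all" (wildcard) and -1 meaning "unspecified", and discard terms were moved to the front of the list. A minimal, self-contained sketch of that encoding (the selector strings in the comments are illustrative, not taken from the patch):

  #include <cstdint>
  #include <cstdio>
  #include <tuple>
  #include <vector>

  int main() {
    using RootDevTy = std::tuple<bool, int32_t, int32_t, int32_t>;
    std::vector<RootDevTy> ExplicitRootDevices;
    // "level_zero:0.*"  -> accept device 0, all sub-devices, no CCS given
    ExplicitRootDevices.push_back({true, 0, -2, -1});
    // "!level_zero:1"   -> discard device 1 entirely (prepended by the parser)
    ExplicitRootDevices.insert(ExplicitRootDevices.begin(), {false, 1, -1, -1});
    for (const auto &T : ExplicitRootDevices)
      std::printf("%c %d.%d.%d\n", std::get<0>(T) ? 'T' : 'F',
                  std::get<1>(T), std::get<2>(T), std::get<3>(T));
    return 0;
  }
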
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 51d6595560484..d632d57ce3d5d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -35,7 +35,6 @@ int32_t LevelZeroPluginTy::findDevices() {
DP("Cannot find any drivers.\n");
return 0;
}
- const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
// We expect multiple drivers on Windows to support different device types,
// so we need to maintain multiple drivers and contexts in general.
@@ -93,13 +92,10 @@ int32_t LevelZeroPluginTy::findDevices() {
llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
- // helper lambdas
- auto addDevice = [ExplicitMode,
- &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+ // helper lambda
+ auto addDevice = [&DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
int32_t SubId = -1, int32_t CCSId = -1) {
- if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
- DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
- }
+ DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
};
for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
const auto zeDevice = RootDevices[RootId].zeDevice;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index eb5da943d56c9..8b31bf7e3a7ec 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -128,11 +128,8 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
- if (BuildFailed) {
- if (IsLibModule)
- return OFFLOAD_SUCCESS;
+ if (BuildFailed)
return OFFLOAD_FAIL;
- }
// Check if module link is required. We do not need this check for
// library module
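
With this change a failed build is reported as OFFLOAD_FAIL even for library modules instead of being silently swallowed. A minimal sketch of how such a failure surfaces from Level Zero, including retrieval of the build log (the buildModule wrapper name is illustrative; only the ze* entry points are real API):

  #include <level_zero/ze_api.h>
  #include <cstdio>
  #include <vector>

  // Returns false on build failure after printing the build log; a
  // reduced illustration of the error path checked in addModule().
  bool buildModule(ze_context_handle_t Ctx, ze_device_handle_t Dev,
                   const ze_module_desc_t &Desc, ze_module_handle_t &Module) {
    ze_module_build_log_handle_t Log;
    const ze_result_t RC = zeModuleCreate(Ctx, Dev, &Desc, &Module, &Log);
    if (RC != ZE_RESULT_SUCCESS) {
      size_t LogSize = 0;
      zeModuleBuildLogGetString(Log, &LogSize, nullptr);
      std::vector<char> Text(LogSize + 1, '\0');
      zeModuleBuildLogGetString(Log, &LogSize, Text.data());
      std::fprintf(stderr, "module build failed: %s\n", Text.data());
    }
    zeModuleBuildLogDestroy(Log);
    return RC == ZE_RESULT_SUCCESS;
  }
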
>From f491f3dc412729f8e8d4288f7c12ecf659a14dcc Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 18:42:05 +0200
Subject: [PATCH 13/13] fix getAsInteger conditions
---
offload/plugins-nextgen/level_zero/src/L0Program.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 8b31bf7e3a7ec..68ef755b2a852 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -362,7 +362,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
break;
case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
- if (!DescStrRef.getAsInteger(10, ImageCount)) {
+ if (DescStrRef.getAsInteger(10, ImageCount)) {
DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
DescStrRef.str().c_str());
ImageCount = 0;
@@ -427,7 +427,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
}
uint64_t Idx = 0;
- if (!SectionNameRef.getAsInteger(10, Idx)) {
+ if (SectionNameRef.getAsInteger(10, Idx)) {
DP("Warning: ignoring image section (invalid index '%s').\n",
SectionNameRef.str().c_str());
continue;
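
The flipped conditions are correct because llvm::StringRef::getAsInteger() returns true on failure, so the error-handling branch must be the true branch. A standalone illustration:

  #include "llvm/ADT/StringRef.h"
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Idx = 0;
    // getAsInteger() returns true on *failure* and false on success.
    if (llvm::StringRef("42").getAsInteger(10, Idx))
      std::printf("unexpected parse failure\n");
    else
      std::printf("parsed %llu\n", (unsigned long long)Idx); // runs
    if (llvm::StringRef("4x").getAsInteger(10, Idx))
      std::printf("'4x' correctly rejected\n"); // runs
    return 0;
  }
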