[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)

Alex Duran via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 3 14:51:46 PST 2025


https://github.com/adurang updated https://github.com/llvm/llvm-project/pull/158900

>From 0c427647d9ce0de9506992dfb16074178bebcc19 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 11:46:48 +0200
Subject: [PATCH 01/70] [OFFLOAD] Add plugin with support for Intel Level Zero

---
 offload/CMakeLists.txt                        |   17 +-
 .../Modules/LibomptargetGetDependencies.cmake |   21 +
 offload/include/OpenMP/InteropAPI.h           |    7 +-
 offload/include/PerThreadTable.h              |  155 ++-
 .../plugins-nextgen/common/include/DLWrap.h   |   16 +
 .../plugins-nextgen/level_zero/CMakeLists.txt |   69 ++
 .../level_zero/include/AsyncQueue.h           |   50 +
 .../level_zero/include/L0Context.h            |  138 +++
 .../level_zero/include/L0Defs.h               |   73 ++
 .../level_zero/include/L0Device.h             |  680 +++++++++++
 .../level_zero/include/L0Interop.h            |   25 +
 .../level_zero/include/L0Kernel.h             |  154 +++
 .../level_zero/include/L0Memory.h             |  574 +++++++++
 .../level_zero/include/L0Options.h            |  189 +++
 .../level_zero/include/L0Plugin.h             |  136 +++
 .../level_zero/include/L0Program.h            |  135 +++
 .../level_zero/include/L0Trace.h              |  193 +++
 .../plugins-nextgen/level_zero/include/TLS.h  |   86 ++
 .../level_zero/src/L0Context.cpp              |   41 +
 .../level_zero/src/L0Device.cpp               | 1065 +++++++++++++++++
 .../level_zero/src/L0DynWrapper.cpp           |  134 +++
 .../level_zero/src/L0Kernel.cpp               |  649 ++++++++++
 .../level_zero/src/L0Memory.cpp               |  637 ++++++++++
 .../level_zero/src/L0Options.cpp              |  371 ++++++
 .../level_zero/src/L0Plugin.cpp               |  285 +++++
 .../level_zero/src/L0Program.cpp              |  625 ++++++++++
 .../level_zero/src/OmpWrapper.cpp             |   71 ++
 27 files changed, 6586 insertions(+), 10 deletions(-)
 create mode 100644 offload/plugins-nextgen/level_zero/CMakeLists.txt
 create mode 100644 offload/plugins-nextgen/level_zero/include/AsyncQueue.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Context.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Defs.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Device.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Interop.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Kernel.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Memory.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Options.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Plugin.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Program.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Trace.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/TLS.h
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Context.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Device.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Memory.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Options.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Program.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index b277380783500..8a704ab05eb53 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
   message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
 endif()
 
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero)
 set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
-    "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+    "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".")
 
 if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
   set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
@@ -176,6 +176,19 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
   endif()
 endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
+        CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
+  if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+    message(STATUS "Not building Level Zero plugin: it is only supported on "
+                   "Linux/Windows x86_64 hosts")
+    list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+  endif()
+endif()
+if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
+		NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+    message(STATUS "Not building Level Zero plugin: dependencies not found")
+    list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+endif()
 message(STATUS "Building the offload library with support for "
                "the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
 
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 2a8bdebf2c1dd..0af0ae1ecdbec 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -89,4 +89,25 @@ if(LIBOMPTARGET_AMDGPU_ARCH)
   endif()
 endif()
 
+################################################################################
+# Looking for Level0
+################################################################################
+message(STATUS "Looking for Level0 includes.")
+find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
+
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
+	set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
+  message(STATUS "Could NOT find Level Zero. Missing includes.")
+else()
+  message(STATUS "Level Zero include DIR: ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}")
+  set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
+  message(STATUS "Looking for Level Zero library.")
+  find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
+  if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES)
+    message(STATUS "Could NOT find Level Zero. Missing library.")
+  else()
+	  message(STATUS "Level Zero library: ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES}")
+  endif()
+endif()
+
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h
index 53ac4be2e2e98..2553bfa930784 100644
--- a/offload/include/OpenMP/InteropAPI.h
+++ b/offload/include/OpenMP/InteropAPI.h
@@ -160,17 +160,12 @@ struct InteropTableEntry {
     Interops.push_back(obj);
   }
 
-  template <class ClearFuncTy> void clear(ClearFuncTy f) {
-    for (auto &Obj : Interops) {
-      f(Obj);
-    }
-  }
-
   /// vector interface
   int size() const { return Interops.size(); }
   iterator begin() { return Interops.begin(); }
   iterator end() { return Interops.end(); }
   iterator erase(iterator it) { return Interops.erase(it); }
+  void clear() { Interops.clear(); }
 };
 
 struct InteropTblTy
diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index 45b196171b4c8..0241370953c67 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -16,6 +16,60 @@
 #include <list>
 #include <memory>
 #include <mutex>
+#include <type_traits>
+
+template <typename ObjectType> struct PerThread {
+  struct PerThreadData {
+    std::unique_ptr<ObjectType> ThEntry;
+  };
+
+  std::mutex Mtx;
+  std::list<std::shared_ptr<PerThreadData>> ThreadDataList;
+
+  // define default constructors, disable copy and move constructors
+  PerThread() = default;
+  PerThread(const PerThread &) = delete;
+  PerThread(PerThread &&) = delete;
+  PerThread &operator=(const PerThread &) = delete;
+  PerThread &operator=(PerThread &&) = delete;
+  ~PerThread() {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    ThreadDataList.clear();
+  }
+
+private:
+  PerThreadData &getThreadData() {
+    static thread_local std::shared_ptr<PerThreadData> ThData = nullptr;
+    if (!ThData) {
+      ThData = std::make_shared<PerThreadData>();
+      std::lock_guard<std::mutex> Lock(Mtx);
+      ThreadDataList.push_back(ThData);
+    }
+    return *ThData;
+  }
+
+protected:
+  ObjectType &getThreadEntry() {
+    auto &ThData = getThreadData();
+    if (ThData.ThEntry)
+      return *ThData.ThEntry;
+    ThData.ThEntry = std::make_unique<ObjectType>();
+    return *ThData.ThEntry;
+  }
+
+public:
+  ObjectType &get() { return getThreadEntry(); }
+
+  /// Apply f to each thread's existing entry, then drop the thread list.
+  template <class F> void clear(F f) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    // By reference: copying the shared_ptr would bump its atomic refcount.
+    for (const auto &ThData : ThreadDataList)
+      if (ThData->ThEntry)
+        f(*ThData->ThEntry);
+    ThreadDataList.clear();
+  }
+};
 
 // Using an STL container (such as std::vector) indexed by thread ID has
 // too many race conditions issues so we store each thread entry into a
@@ -23,10 +77,32 @@
 // T is the container type used to store the objects, e.g., std::vector,
 // std::set, etc. by each thread. O is the type of the stored objects e.g.,
 // omp_interop_val_t *, ...
-
 template <typename ContainerType, typename ObjectType> struct PerThreadTable {
   using iterator = typename ContainerType::iterator;
 
+  template <typename, typename = std::void_t<>>
+  struct has_iterator : std::false_type {};
+  template <typename T>
+  struct has_iterator<T, std::void_t<typename T::iterator>> : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_clear : std::false_type {};
+  template <typename T>
+  struct has_clear<T, std::void_t<decltype(std::declval<T>().clear())>>
+      : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_clearAll : std::false_type {};
+  template <typename T>
+  struct has_clearAll<T, std::void_t<decltype(std::declval<T>().clearAll(1))>>
+      : std::true_type {};
+
+  template <typename, typename = std::void_t<>>
+  struct is_associative : std::false_type {};
+  template <typename T>
+  struct is_associative<T, std::void_t<typename T::mapped_type>>
+      : std::true_type {};
+
   struct PerThreadData {
     size_t NElements = 0;
     std::unique_ptr<ContainerType> ThEntry;
@@ -71,6 +147,11 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
     return ThData.NElements;
   }
 
+  void setNElements(size_t Size) {
+    auto &NElements = getThreadNElements();
+    NElements = Size;
+  }
+
 public:
   void add(ObjectType obj) {
     auto &Entry = getThreadEntry();
@@ -104,11 +185,81 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
     for (auto ThData : ThreadDataList) {
       if (!ThData->ThEntry || ThData->NElements == 0)
         continue;
-      ThData->ThEntry->clear(f);
+      if constexpr (has_clearAll<ContainerType>::value) {
+        ThData->ThEntry->clearAll(f);
+      } else if constexpr (has_iterator<ContainerType>::value &&
+                           has_clear<ContainerType>::value) {
+        for (auto &Obj : *ThData->ThEntry) {
+          if constexpr (is_associative<ContainerType>::value)
+            f(Obj.second);
+          else
+            f(Obj);
+        }
+        ThData->ThEntry->clear();
+      } else { // Always false here, but dependent so it fires only if used.
+        static_assert(has_clearAll<ContainerType>::value,
+                      "Container type not supported");
+      }
       ThData->NElements = 0;
     }
     ThreadDataList.clear();
   }
 };
 
+template <typename T, typename = std::void_t<>> struct ContainerValueType {
+  using type = typename T::value_type;
+};
+template <typename T>
+struct ContainerValueType<T, std::void_t<typename T::mapped_type>> {
+  using type = typename T::mapped_type;
+};
+
+template <typename ContainerType, size_t reserveSize = 0>
+struct PerThreadContainer
+    : public PerThreadTable<ContainerType,
+                            typename ContainerValueType<ContainerType>::type> {
+
+  // helpers
+  template <typename T, typename = std::void_t<>> struct indexType {
+    using type = typename T::size_type;
+  };
+  template <typename T> struct indexType<T, std::void_t<typename T::key_type>> {
+    using type = typename T::key_type;
+  };
+  template <typename T, typename = std::void_t<>>
+  struct has_resize : std::false_type {};
+  template <typename T>
+  struct has_resize<T, std::void_t<decltype(std::declval<T>().resize(1))>>
+      : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_reserve : std::false_type {};
+  template <typename T>
+  struct has_reserve<T, std::void_t<decltype(std::declval<T>().reserve(1))>>
+      : std::true_type {};
+
+  using IndexType = typename indexType<ContainerType>::type;
+  using ObjectType = typename ContainerValueType<ContainerType>::type;
+
+  // Get the current thread's object at Index (resizable containers grow).
+  ObjectType &get(IndexType Index) {
+    auto &Entry = this->getThreadEntry();
+
+    // Containers with resize(): grow on demand so that Index is in range.
+    if constexpr (has_resize<ContainerType>::value) {
+      if (Index >= Entry.size()) {
+        if constexpr (has_reserve<ContainerType>::value && reserveSize > 0) {
+          if (Entry.capacity() < reserveSize)
+            Entry.reserve(reserveSize);
+        }
+        // Index is out of bounds: resize the container to hold Index.
+        Entry.resize(Index + 1);
+      }
+    }
+    ObjectType &Ret = Entry[Index];
+    this->setNElements(Entry.size());
+    return Ret;
+  }
+};
+
 #endif
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h
index 8934e7e701021..95ce86e123cd3 100644
--- a/offload/plugins-nextgen/common/include/DLWrap.h
+++ b/offload/plugins-nextgen/common/include/DLWrap.h
@@ -282,5 +282,21 @@ template <size_t Requested, size_t Required> constexpr void verboseAssert() {
     return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
                                           x9, x10);                            \
   }
+#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T)                             \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7,                  \
+                        typename T::template arg<8>::type x8,                  \
+                        typename T::template arg<9>::type x9,                  \
+                        typename T::template arg<10>::type x10,                \
+                        typename T::template arg<11>::type x11) {              \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
+                                          x9, x10, x11);                       \
+  }
 
 #endif // OMPTARGET_SHARED_DLWRAP_H
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
new file mode 100644
index 0000000000000..b9c8dd423c3ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -0,0 +1,69 @@
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+return()
+endif()
+
+# Create the library and add the default arguments.
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
+
+# Common sources (src/L0DynWrapper.cpp is added below for the dlopen build).
+set(LEVEL_ZERO_SRC_FILES
+        src/L0Context.cpp
+        src/L0Device.cpp
+        src/L0Kernel.cpp
+        src/L0Memory.cpp
+        src/L0Program.cpp
+        src/L0Plugin.cpp
+        src/L0Options.cpp
+)
+list(APPEND LEVEL_ZERO_SRC_FILES
+        src/OmpWrapper.cpp
+)
+
+target_sources(omptarget.rtl.level_zero PRIVATE
+   ${LEVEL_ZERO_SRC_FILES}
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+      ${CMAKE_CURRENT_SOURCE_DIR}/include
+      ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+      ${LIBOMPTARGET_INCLUDE_DIR}
+      ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+      ${LIBOMPTARGET_OMP_HEADER_DIR}
+)
+
+if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
+message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  target_link_libraries(omptarget.rtl.level_zero PRIVATE
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+  # Full path to the L0 library is recognized as a linker option, so we
+  # separate directory and file name
+  get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+  get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+  target_link_libraries(omptarget.rtl.level_zero PRIVATE
+          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+  target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
+  target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+  libomptarget_add_resource_file(omptarget.rtl.level_zero)
+else()
+   message(FATAL_ERROR "Missing platform support")
+endif()
+
+else()
+message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+   # Windows uses dll instead of lib files at runtime
+   string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
+endif()
+target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+endif()
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
new file mode 100644
index 0000000000000..105f68205e402
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -0,0 +1,50 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Async Queue wrapper for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <vector>
+
+#include "L0Memory.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Abstract queue that supports asynchronous command submission
+struct AsyncQueueTy {
+  /// List of events attached to submitted commands
+  std::vector<ze_event_handle_t> WaitEvents;
+  /// Pending staging buffer to host copies
+  std::list<std::tuple<void *, void *, size_t>> H2MList;
+  /// Pending USM memory copy commands that must wait for kernel completion
+  std::list<std::tuple<const void *, void *, size_t>> USM2MList;
+  /// Kernel event not signaled
+  ze_event_handle_t KernelEvent = nullptr;
+  /// Whether this queue is currently in use
+  bool InUse = false;
+  /// Clear the pending events and copy lists; InUse is left untouched
+  void reset() {
+    WaitEvents.clear();
+    H2MList.clear();
+    USM2MList.clear();
+    KernelEvent = nullptr;
+  }
+};
+
+typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
new file mode 100644
index 0000000000000..b2b6def8101ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Memory.h"
+#include "PerThreadTable.h"
+
+namespace llvm::omp::target::plugin {
+
+class LevelZeroPluginTy;
+
+class L0ContextTLSTy {
+  StagingBufferTy StagingBuffer;
+
+public:
+  auto &getStagingBuffer() { return StagingBuffer; }
+  const auto &getStagingBuffer() const { return StagingBuffer; }
+
+  void clear() { StagingBuffer.clear(); }
+};
+
+struct L0ContextTLSTableTy
+    : public PerThreadContainer<
+          std::unordered_map<ze_context_handle_t, L0ContextTLSTy>> {
+  void clear() {
+    PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); });
+  }
+};
+
+/// Driver and context-specific resources. We assume a single context per
+/// driver.
+class L0ContextTy {
+  /// The plugin that created this context
+  LevelZeroPluginTy &Plugin;
+
+  /// Level Zero Driver handle
+  ze_driver_handle_t zeDriver = nullptr;
+
+  /// Common Level Zero context
+  ze_context_handle_t zeContext = nullptr;
+
+  /// API version supported by the Level Zero driver
+  ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT;
+
+  /// Imported external pointers. Track this only for user-directed
+  /// imports/releases.
+  std::unordered_map<uintptr_t, size_t> ImportedPtrs;
+
+  /// Common event pool
+  EventPoolTy EventPool;
+
+  /// Host Memory allocator for this driver
+  MemAllocatorTy HostMemAllocator;
+
+public:
+  /// Named constants for checking the imported external pointer regions.
+  static constexpr int32_t ImportNotExist = -1;
+  static constexpr int32_t ImportUnknown = 0;
+  static constexpr int32_t ImportExist = 1;
+
+  /// Create context, initialize event pool and extension functions
+  L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+              int32_t DriverId);
+
+  L0ContextTy(const L0ContextTy &) = delete;
+  L0ContextTy(L0ContextTy &&) = delete;
+  L0ContextTy &operator=(const L0ContextTy &) = delete;
+  L0ContextTy &operator=(L0ContextTy &&) = delete;
+
+  /// Release resources
+  ~L0ContextTy() {
+    EventPool.deinit();
+    HostMemAllocator.deinit();
+    if (zeContext)
+      CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
+  }
+
+  auto &getPlugin() const { return Plugin; }
+
+  StagingBufferTy &getStagingBuffer();
+
+  /// Add imported external pointer region.
+  void addImported(void *Ptr, size_t Size) {
+    (void)ImportedPtrs.emplace((uintptr_t)Ptr, Size);
+  }
+
+  /// Remove imported external pointer region
+  void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); }
+
+  /// Check if imported regions contain (or partially overlap) the region.
+  int32_t checkImported(void *Ptr, size_t Size) const {
+    uintptr_t LB = (uintptr_t)Ptr;
+    uintptr_t UB = LB + Size;
+    // Few user-directed imports are expected, so a linear scan is enough.
+    for (auto &I : ImportedPtrs) {
+      uintptr_t ILB = I.first;
+      uintptr_t IUB = ILB + I.second;
+      if (LB >= ILB && UB <= IUB)
+        return ImportExist;
+      // Any other overlap (e.g. the query spans the import) is partial.
+      if (LB < IUB && UB > ILB)
+        return ImportUnknown;
+    }
+    return ImportNotExist;
+  }
+
+  ze_driver_handle_t getZeDriver() const { return zeDriver; }
+
+  /// Return context associated with the driver
+  ze_context_handle_t getZeContext() const { return zeContext; }
+
+  /// Return driver API version
+  ze_api_version_t getDriverAPIVersion() const { return APIVersion; }
+
+  /// Return the event pool of this driver
+  auto &getEventPool() { return EventPool; }
+  const auto &getEventPool() const { return EventPool; }
+
+  bool supportsLargeMem() const {
+    // Large memory support is available since API version 1.1
+    return getDriverAPIVersion() >= ZE_API_VERSION_1_1;
+  }
+
+  const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
+  MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
new file mode 100644
index 0000000000000..81566f52a2aea
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -0,0 +1,73 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// External and other auxiliary definitions
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "PluginInterface.h"
+#include "Shared/Requirements.h"
+#include "omptarget.h"
+
+#define LIBOMP_DECL(RetType, FnDecl) RetType __cdecl FnDecl
+
+enum class AllocOptionTy : int32_t {
+  ALLOC_OPT_NONE = 0,
+  ALLOC_OPT_REDUCTION_SCRATCH = 1,
+  ALLOC_OPT_REDUCTION_COUNTER = 2,
+  ALLOC_OPT_HOST_MEM = 3,
+  ALLOC_OPT_SLM = 4,
+};
+
+/// Host runtime routines being used
+extern "C" {
+LIBOMP_DECL(int, omp_get_max_teams(void));
+LIBOMP_DECL(int, omp_get_thread_limit(void));
+LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
+LIBOMP_DECL(double, omp_get_wtime(void));
+} // extern "C"
+
+#ifndef EXTRACT_BITS
+// Bits are numbered MSB=63 .. LSB=0; HIGH - LOW + 1 must be less than 64.
+#define EXTRACT_BITS(I64, HIGH, LOW)                                           \
+  ((((uint64_t)(I64)) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1))
+#endif
+
+namespace llvm::omp::target::plugin {
+
+/// Default alignment for allocation
+constexpr size_t L0Alignment = 0;
+/// Default staging buffer size for host to device copy (16KB)
+constexpr size_t L0StagingBufferSize = (1 << 14);
+/// Default staging buffer count
+constexpr size_t L0StagingBufferCount = 64;
+/// USM allocation threshold where preallocation does not pay off (128MB)
+constexpr size_t L0UsmPreAllocThreshold = (128 << 20);
+/// Host USM allocation threshold where preallocation does not pay off (8MB)
+constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20);
+
+using namespace error;
+/// Generic L0 handle type
+using ZeHandleTy = void *;
+
+template <typename... ArgsTy>
+static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+
+  if (Code == OFFLOAD_SUCCESS)
+    return Plugin::success();
+  const char *Desc = "Unknown error";
+  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
+                                                    ErrFmt, Args..., Desc);
+}
+
+#define L0_UNIMPLEMENTED_ERR                                                   \
+  return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet\n",   \
+                       __func__);
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
new file mode 100644
index 0000000000000..6acfa7e0ee67d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -0,0 +1,680 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/SmallVector.h"
+
+#include "PerThreadTable.h"
+
+#include "AsyncQueue.h"
+#include "L0Context.h"
+#include "L0Program.h"
+#include "PluginInterface.h"
+#include "TLS.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+using OmpInteropTy = omp_interop_val_t *;
+class LevelZeroPluginTy;
+
+// clang-format off
+enum class PCIIdTy : int32_t { // PCI device-id families; matched as (id & 0xFF00)
+  None            = 0x0000,
+  SKL             = 0x1900,
+  KBL             = 0x5900,
+  CFL             = 0x3E00,
+  CFL_2           = 0x9B00,
+  ICX             = 0x8A00,
+  TGL             = 0xFF20, // NOTE(review): low byte set, cannot equal (id & 0xFF00) - confirm
+  TGL_2           = 0x9A00,
+  DG1             = 0x4900,
+  RKL             = 0x4C00,
+  ADLS            = 0x4600,
+  RTL             = 0xA700,
+  MTL             = 0x7D00,
+  PVC             = 0x0B00,
+  DG2_ATS_M       = 0x4F00,
+  DG2_ATS_M_2     = 0x5600,
+  LNL             = 0x6400,
+  BMG             = 0xE200,
+};
+
+/// Device type enumeration common to compiler and runtime
+enum class DeviceArchTy : uint64_t {
+  DeviceArch_None   = 0,      // every other enumerator is a distinct single bit
+  DeviceArch_Gen    = 0x0001, // Gen 9, Gen 11 or Xe
+  DeviceArch_XeLPG  = 0x0002,
+  DeviceArch_XeHPC  = 0x0004,
+  DeviceArch_XeHPG  = 0x0008,
+  DeviceArch_Xe2LP  = 0x0010,
+  DeviceArch_Xe2HP  = 0x0020,
+  DeviceArch_x86_64 = 0x0100
+};
+// clang-format on
+
+struct L0DeviceIdTy {
+  ze_device_handle_t zeId; // Level Zero device handle
+  int32_t RootId;
+  int32_t SubId; // -1 unless passed to the constructor
+  int32_t CCSId; // -1 unless passed to the constructor
+
+  L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1,
+               int32_t CCSId = -1)
+      : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {}
+};
+
+/// Holds one device's command lists/queues; call clear() before destruction.
+class L0DeviceTLSTy {
+  /// Command list for each device
+  ze_command_list_handle_t CmdList = nullptr;
+
+  /// Main copy command list for each device
+  ze_command_list_handle_t CopyCmdList = nullptr;
+
+  /// Link copy command list for each device
+  ze_command_list_handle_t LinkCopyCmdList = nullptr;
+
+  /// Command queue for each device
+  ze_command_queue_handle_t CmdQueue = nullptr;
+
+  /// Main copy command queue for each device
+  ze_command_queue_handle_t CopyCmdQueue = nullptr;
+
+  /// Link copy command queues for each device
+  ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
+
+  /// Immediate command list for each device
+  ze_command_list_handle_t ImmCmdList = nullptr;
+
+  /// Immediate copy command list for each device
+  ze_command_list_handle_t ImmCopyCmdList = nullptr;
+
+public:
+  L0DeviceTLSTy() = default;
+  ~L0DeviceTLSTy() {
+    // Every handle must be null here (released by clear() or moved away)
+    assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
+    assert(CopyCmdList == nullptr &&
+           "CopyCmdList is not nullptr on destruction");
+    assert(LinkCopyCmdList == nullptr &&
+           "LinkCopyCmdList is not nullptr on destruction");
+    assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
+    assert(CopyCmdQueue == nullptr &&
+           "CopyCmdQueue is not nullptr on destruction");
+    assert(LinkCopyCmdQueue == nullptr &&
+           "LinkCopyCmdQueue is not nullptr on destruction");
+    assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
+    assert(ImmCopyCmdList == nullptr &&
+           "ImmCopyCmdList is not nullptr on destruction");
+  }
+
+  L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
+  L0DeviceTLSTy(L0DeviceTLSTy &&Other) noexcept
+      : CmdList(std::exchange(Other.CmdList, nullptr)),
+        CopyCmdList(std::exchange(Other.CopyCmdList, nullptr)),
+        LinkCopyCmdList(std::exchange(Other.LinkCopyCmdList, nullptr)),
+        CmdQueue(std::exchange(Other.CmdQueue, nullptr)),
+        CopyCmdQueue(std::exchange(Other.CopyCmdQueue, nullptr)),
+        LinkCopyCmdQueue(std::exchange(Other.LinkCopyCmdQueue, nullptr)),
+        ImmCmdList(std::exchange(Other.ImmCmdList, nullptr)),
+        ImmCopyCmdList(std::exchange(Other.ImmCopyCmdList, nullptr)) {}
+
+  void clear() {
+    // Destroy all lists first, then all queues, then reset every handle
+    if (CmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
+    if (CopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
+    if (LinkCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
+    if (ImmCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
+    if (ImmCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList);
+    if (CmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
+    if (CopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
+    if (LinkCopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
+
+    CmdList = nullptr;
+    CopyCmdList = nullptr;
+    LinkCopyCmdList = nullptr;
+    CmdQueue = nullptr;
+    CopyCmdQueue = nullptr;
+    LinkCopyCmdQueue = nullptr;
+    ImmCmdList = nullptr;
+    ImmCopyCmdList = nullptr;
+  }
+
+  L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
+  L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
+
+  auto getCmdList() const { return CmdList; }
+  void setCmdList(ze_command_list_handle_t Handle) { CmdList = Handle; }
+
+  auto getCopyCmdList() const { return CopyCmdList; }
+  void setCopyCmdList(ze_command_list_handle_t Handle) {
+    CopyCmdList = Handle;
+  }
+
+  auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+  void setLinkCopyCmdList(ze_command_list_handle_t Handle) {
+    LinkCopyCmdList = Handle;
+  }
+
+  auto getImmCmdList() const { return ImmCmdList; }
+  void setImmCmdList(ze_command_list_handle_t Handle) {
+    ImmCmdList = Handle;
+  }
+
+  auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+  void setImmCopyCmdList(ze_command_list_handle_t Handle) {
+    ImmCopyCmdList = Handle;
+  }
+
+  auto getCmdQueue() const { return CmdQueue; }
+  void setCmdQueue(ze_command_queue_handle_t Handle) {
+    CmdQueue = Handle;
+  }
+
+  auto getCopyCmdQueue() const { return CopyCmdQueue; }
+  void setCopyCmdQueue(ze_command_queue_handle_t Handle) {
+    CopyCmdQueue = Handle;
+  }
+
+  auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+  void setLinkCopyCmdQueue(ze_command_queue_handle_t Handle) {
+    LinkCopyCmdQueue = Handle;
+  }
+};
+
+struct L0DeviceTLSTableTy
+    : public PerThreadContainer<std::vector<L0DeviceTLSTy>, 8> {
+  void clear() { // NOTE(review): base is PerThreadContainer - confirm qualifier
+    PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); });
+  }
+};
+
+/// GenericDeviceTy implementation backed by a Level Zero device handle
+class L0DeviceTy final : public GenericDeviceTy {
+  // Level Zero Context for this Device
+  L0ContextTy &l0Context;
+
+  // Level Zero handle for this Device
+  ze_device_handle_t zeDevice;
+  // Device Properties
+  ze_device_properties_t DeviceProperties{};
+  ze_device_compute_properties_t ComputeProperties{};
+  ze_device_memory_properties_t MemoryProperties{};
+  ze_device_cache_properties_t CacheProperties{};
+
+  /// Devices' default target allocation kind for internal allocation
+  int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+  DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None;
+
+  std::string DeviceName;
+
+  /// Common indirect access flags for this device
+  ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0;
+
+  /// Device UUID for toplevel devices only
+  std::string DeviceUuid;
+
+  /// L0 Device ID as string
+  std::string zeId;
+
+  /// Command queue group ordinals for each device
+  std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
+  /// Command queue group ordinals for copying
+  std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
+  /// Command queue group ordinals and number of queues for link copy engines
+  std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
+
+  /// Command queue index for each device
+  uint32_t ComputeIndex = 0;
+
+  bool IsAsyncEnabled = false;
+
+  // lock for this device
+  std::mutex Mutex;
+
+  /// Contains all modules (possibly from multiple device images) to handle
+  /// dynamic link across multiple images
+  llvm::SmallVector<ze_module_handle_t> GlobalModules;
+
+  /// L0 programs created for this device
+  std::list<L0ProgramTy> Programs;
+
+  /// MemAllocator for this device
+  MemAllocatorTy MemAllocator;
+
+  /// The current size of the global device memory pool (managed by us).
+  uint64_t HeapSize = 1L << 23L /*8MB=*/;
+
+  int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
+  int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+                     __tgt_async_info *AsyncInfo);
+  int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+                       __tgt_async_info *AsyncInfo);
+
+  bool shouldSetupDeviceMemoryPool() const override { return false; }
+  DeviceArchTy computeArch() const;
+
+  /// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+  std::pair<uint32_t, uint32_t> findComputeOrdinal();
+
+  /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+  std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+
+  Error internalInit();
+
+public:
+  L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+             ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
+             const std::string &zeId, int32_t ComputeIndex)
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+        l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
+        ComputeIndex(ComputeIndex) {
+    DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    DeviceProperties.pNext = nullptr;
+    ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
+    ComputeProperties.pNext = nullptr;
+    MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+    MemoryProperties.pNext = nullptr;
+    CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
+    CacheProperties.pNext = nullptr;
+
+    if (auto Err = internalInit()) {
+      FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
+                    toString(std::move(Err)).c_str());
+    }
+  }
+
+  static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
+    return static_cast<L0DeviceTy &>(Device);
+  }
+  static L0DeviceTy &makeL0Device(ompt_device_t *Device) {
+    return *static_cast<L0DeviceTy *>(Device);
+  }
+
+  auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+  L0DeviceTLSTy &getTLS();
+
+  Error setContext() override { return Plugin::success(); }
+  Error initImpl(GenericPluginTy &Plugin) override;
+  Error deinitImpl() override {
+    Programs.clear();
+    return Plugin::success();
+  }
+
+  auto getZeDevice() const { return zeDevice; }
+
+  const L0ContextTy &getL0Context() const { return l0Context; }
+  L0ContextTy &getL0Context() { return l0Context; }
+
+  const std::string &getName() const { return DeviceName; }
+  const char *getNameCStr() const { return DeviceName.c_str(); }
+
+  const std::string &getZeId() const { return zeId; }
+  const char *getZeIdCStr() const { return zeId.c_str(); }
+
+  std::mutex &getMutex() { return Mutex; }
+
+  auto getComputeIndex() const { return ComputeIndex; }
+  auto getIndirectFlags() const { return IndirectAccessFlags; }
+
+  auto getNumGlobalModules() const { return GlobalModules.size(); }
+  void addGlobalModule(ze_module_handle_t Module) {
+    GlobalModules.push_back(Module);
+  }
+  auto getGlobalModulesArray() { return GlobalModules.data(); }
+
+  L0ProgramTy *getProgramFromImage(const __tgt_device_image *Image) {
+    for (auto &PGM : Programs)
+      if (PGM.getTgtImage() == Image)
+        return &PGM;
+    return nullptr;
+  }
+
+  int32_t buildAllKernels() {
+    for (auto &PGM : Programs) {
+      int32_t RC = PGM.loadModuleKernels();
+      if (RC != OFFLOAD_SUCCESS)
+        return RC;
+    }
+    return OFFLOAD_SUCCESS;
+  }
+
+  // Add a new program to the device. Return a reference to the new program
+  auto &addProgram(int32_t ImageId, const __tgt_device_image *Image) {
+    Programs.emplace_back(ImageId, *this, Image);
+    return Programs.back();
+  }
+
+  const auto &getLastProgram() const { return Programs.back(); }
+  auto &getLastProgram() { return Programs.back(); }
+  // Device properties getters
+  auto getVendorId() const { return DeviceProperties.vendorId; }
+  bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
+
+  auto getPCIId() const { return DeviceProperties.deviceId; }
+  auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
+  auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+  auto getNumEUsPerSubslice() const {
+    return DeviceProperties.numEUsPerSubslice;
+  }
+  auto getNumSubslicesPerSlice() const {
+    return DeviceProperties.numSubslicesPerSlice;
+  }
+  auto getNumSlices() const { return DeviceProperties.numSlices; }
+  auto getNumSubslices() const {
+    return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
+  }
+  uint32_t getNumEUs() const {
+    return DeviceProperties.numEUsPerSubslice * getNumSubslices();
+  }
+  auto getTotalThreads() const {
+    return DeviceProperties.numThreadsPerEU * getNumEUs();
+  }
+  auto getNumThreadsPerSubslice() const {
+    return getNumEUsPerSubslice() * getNumThreadsPerEU();
+  }
+  auto getClockRate() const { return DeviceProperties.coreClockRate; }
+
+  auto getMaxSharedLocalMemory() const {
+    return ComputeProperties.maxSharedLocalMemory;
+  }
+  auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+  auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+  auto getCacheSize() const { return CacheProperties.cacheSize; }
+
+  int32_t getAllocKind() const { return AllocKind; }
+  DeviceArchTy getDeviceArch() const { return DeviceArch; }
+  bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; }
+
+  static bool isDiscrete(uint32_t PCIId) {
+    switch (static_cast<PCIIdTy>(PCIId & 0xFF00)) {
+    case PCIIdTy::BMG:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  static bool isDiscrete(ze_device_handle_t Device) {
+    ze_device_properties_t PR{};
+    PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    PR.pNext = nullptr;
+    CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR);
+    return isDiscrete(PR.deviceId);
+  }
+
+  bool isDiscreteDevice() const { return isDiscrete(getPCIId()); }
+  bool isDeviceIPorNewer(uint32_t Version) const;
+
+  const std::string &getUuid() const { return DeviceUuid; }
+
+  uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
+  uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
+
+  bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
+  uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
+
+  uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
+  uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
+  bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
+
+  bool deviceRequiresImmCmdList() const {
+    return isDeviceIPorNewer(0x05004000);
+  }
+  bool asyncEnabled() const { return IsAsyncEnabled; }
+  bool useImmForCompute() const { return true; }
+  bool useImmForCopy() const { return true; }
+  bool useImmForInterop() const { return true; }
+  bool forceInorderInterop() const { return true; }
+
+  void reportDeviceInfo() const;
+
+  // Command queues related functions
+  /// Create a command list with given ordinal and flags
+  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+                                         ze_device_handle_t Device,
+                                         uint32_t Ordinal,
+                                         ze_command_list_flags_t Flags,
+                                         const std::string &DeviceIdStr);
+
+  /// Create a command list with default flags
+  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+                                         ze_device_handle_t Device,
+                                         uint32_t Ordinal,
+                                         const std::string &DeviceIdStr);
+
+  ze_command_list_handle_t getCmdList();
+
+  /// Create a command queue with given ordinal and flags
+  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+                                           ze_device_handle_t Device,
+                                           uint32_t Ordinal, uint32_t Index,
+                                           ze_command_queue_flags_t Flags,
+                                           const std::string &DeviceIdStr);
+
+  /// Create a command queue with default flags
+  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+                                           ze_device_handle_t Device,
+                                           uint32_t Ordinal, uint32_t Index,
+                                           const std::string &DeviceIdStr,
+                                           bool InOrder = false);
+
+  /// Create a new command queue for the given OpenMP device ID
+  ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+
+  /// Create an immediate command list
+  ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+                                            bool InOrder = false);
+
+  /// Create an immediate command list for computing
+  ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+    return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
+  }
+
+  /// Create an immediate command list for copying
+  ze_command_list_handle_t createImmCopyCmdList();
+  ze_command_queue_handle_t getCmdQueue();
+  ze_command_list_handle_t getCopyCmdList();
+  ze_command_queue_handle_t getCopyCmdQueue();
+  ze_command_list_handle_t getLinkCopyCmdList();
+  ze_command_queue_handle_t getLinkCopyCmdQueue();
+  ze_command_list_handle_t getImmCmdList();
+  ze_command_list_handle_t getImmCopyCmdList();
+
+  /// Enqueue copy command
+  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                         __tgt_async_info *AsyncInfo = nullptr,
+                         bool Locked = false, bool UseCopyEngine = true);
+
+  /// Enqueue asynchronous copy command
+  int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                              __tgt_async_info *AsyncInfo, bool CopyTo = true);
+
+  /// Enqueue fill command
+  int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
+                         size_t Size);
+
+  /// Driver related functions
+
+  /// Return the driver handle for this device
+  ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); }
+
+  /// Return context for this device
+  ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); }
+
+  /// Return driver API version for this device
+  ze_api_version_t getDriverAPIVersion() const {
+    return l0Context.getDriverAPIVersion();
+  }
+
+  /// Return an event from the driver associated to this device
+  ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); }
+
+  /// Release event to the pool associated to this device
+  void releaseEvent(ze_event_handle_t Event) {
+    l0Context.getEventPool().releaseEvent(Event, *this);
+  }
+
+  StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); }
+
+  bool supportsLargeMem() const { return l0Context.supportsLargeMem(); }
+
+  // Allocation related routines
+
+  /// Data alloc
+  void *dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+                  bool UserAlloc, bool DevMalloc = false,
+                  uint32_t MemAdvice = UINT32_MAX,
+                  AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
+
+  /// Data delete
+  int32_t dataDelete(void *Ptr);
+
+  /// Return the memory allocation type for the specified memory location.
+  uint32_t getMemAllocType(const void *Ptr) const;
+
+  const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; }
+  MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; }
+
+  MemAllocatorTy &getMemAllocator(int32_t Kind) {
+    if (Kind == TARGET_ALLOC_HOST)
+      return l0Context.getHostMemAllocator();
+    return getDeviceMemAllocator();
+  }
+
+  MemAllocatorTy &getMemAllocator(const void *Ptr) {
+    bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr));
+    if (IsHostMem)
+      return l0Context.getHostMemAllocator();
+    return getDeviceMemAllocator();
+  }
+
+  int32_t makeMemoryResident(void *Mem, size_t Size);
+
+  // Generic device interface implementation
+  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+                                           int32_t ImageId) override;
+  Error unloadBinaryImpl(DeviceImageTy *Image) override;
+  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+  int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
+
+  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+    return Plugin::error(error::ErrorCode::UNKNOWN,
+                         "dataLockImpl not supported");
+  }
+  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
+  Expected<bool> isPinnedPtrImpl(void *, void *&, void *&,
+                                 size_t &) const override {
+    // Don't need to do anything, this is handled by the driver.
+    return false;
+  }
+
+  Error dataFence(__tgt_async_info *Async) override;
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool ReleaseQueue) override;
+  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override;
+  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+                         void *DstPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error initDeviceInfoImpl(__tgt_device_info *Info) override;
+  Expected<bool>
+  hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override{
+      L0_UNIMPLEMENTED_ERR}
+
+  /* Event routines are used to ensure ordering between dataTransfers. Instead
+   * of adding extra events in the queues, we make sure they're ordered by
+   * using the events from the data submission APIs so we don't need to support
+   * these routines.
+   * They still need to report success to indicate the events are handled
+   * somewhere. waitEvent and syncEvent should remain unimplemented.
+   */
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &) override {
+    return true;
+  }
+
+  Error createEventImpl(void **EventPtrStorage) override {
+    return Plugin::success();
+  }
+  Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
+  Error recordEventImpl(void *EventPtr,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::success();
+  }
+
+  Error waitEventImpl(void *EventPtr,
+                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(error::ErrorCode::UNIMPLEMENTED,
+                         "%s not implemented yet\n", __func__);
+  }
+
+  Error syncEventImpl(void *EventPtr) override {
+    return Plugin::error(error::ErrorCode::UNIMPLEMENTED,
+                         "%s not implemented yet\n", __func__);
+  }
+
+  Expected<InfoTreeNode> obtainInfoImpl() override;
+
+  Error getDeviceStackSize(uint64_t &V) override {
+    V = 0;
+    return Plugin::success();
+  }
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override;
+
+  Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); }
+  Error getDeviceHeapSize(uint64_t &V) override {
+    V = HeapSize;
+    return Plugin::success();
+  }
+  Error setDeviceHeapSize(uint64_t V) override {
+    HeapSize = V;
+    return Plugin::success();
+  }
+
+  Expected<omp_interop_val_t *>
+  createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override;
+  Error releaseInterop(omp_interop_val_t *Interop) override;
+
+  interop_spec_t selectInteropPreference(int32_t InteropType,
+                                         int32_t NumPrefers,
+                                         interop_spec_t *Prefers) override;
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
new file mode 100644
index 0000000000000..4b8b417f9b339
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -0,0 +1,25 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interop support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+namespace llvm::omp::target::plugin::L0Interop {
+
+/// Level Zero interop property
+struct Property {
+  // Keeps the command queue reachable from here, because the targetsync field
+  // in the interop object is changed when the preferred type is sycl.
+  ze_command_queue_handle_t CommandQueue;
+  ze_command_list_handle_t ImmCmdList;
+};
+
+} // namespace llvm::omp::target::plugin::L0Interop
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
new file mode 100644
index 0000000000000..bc6fc54cdea08
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -0,0 +1,154 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+#include "PluginInterface.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+class L0ProgramTy;
+
+/// Bounds and stride describing a single loop level
+struct TgtLoopDescTy {
+  int64_t Lb{0};     // Lower bound of this loop level
+  int64_t Ub{0};     // Upper bound of this loop level
+  int64_t Stride{0}; // Stride of this loop level
+};
+
+struct TgtNDRangeDescTy {
+  int32_t NumLoops{0};      // How many loops/dimensions are described
+  int32_t DistributeDim{0}; // Dimensions lower than this one
+                            // must end up in one WG
+  TgtLoopDescTy Levels[3];  // Storage for at most 3 loops
+};
+
+/// Kernel properties.
+struct KernelPropertiesTy {
+  uint32_t Width = 0;
+  uint32_t SIMDWidth = 0;
+  uint32_t MaxThreadGroupSize = 0;
+
+  /// Cached input parameters used in the previous launch
+  TgtNDRangeDescTy LoopDesc;
+  int32_t NumTeams = -1;
+  int32_t ThreadLimit = -1;
+
+  /// Cached parameters used in the previous launch
+  ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX;
+  uint32_t GroupSizes[3] = {0, 0, 0};
+  ze_group_count_t GroupCounts{0, 0, 0};
+  bool AllowCooperative = false;
+
+  std::mutex Mtx;
+
+  static constexpr TgtNDRangeDescTy LoopDescInit = {};
+
+  /// Check if we can reuse group parameters. A null LoopDescPtr stands for the
+  /// default descriptor; on a match the cached launch values are copied out.
+  bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+                        const int32_t InNumTeams, const int32_t InThreadLimit,
+                        uint32_t *OutSizes, ze_group_count_t &OutCounts,
+                        bool &OutAllowCooperative) const {
+    const TgtNDRangeDescTy &Wanted = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
+    if (memcmp(&Wanted, &LoopDesc, sizeof(LoopDesc)))
+      return false;
+    if (InNumTeams != NumTeams || InThreadLimit != ThreadLimit)
+      return false;
+    // Found matching input parameters.
+    std::copy_n(GroupSizes, 3, OutSizes);
+    OutCounts = GroupCounts;
+    OutAllowCooperative = AllowCooperative;
+    return true;
+  }
+
+  /// Update cached group parameters.
+  void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+                        const int32_t NewNumTeams, const int32_t NewThreadLimit,
+                        const uint32_t *NewGroupSizes,
+                        const ze_group_count_t &NewGroupCounts,
+                        const bool &NewAllowCooperative) {
+    LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
+    NumTeams = NewNumTeams;
+    ThreadLimit = NewThreadLimit;
+    std::copy_n(NewGroupSizes, 3, GroupSizes);
+    GroupCounts = NewGroupCounts;
+    AllowCooperative = NewAllowCooperative;
+  }
+};
+
+/// Level Zero implementation of GenericKernelTy wrapping a ze_kernel_handle_t
+class L0KernelTy : public GenericKernelTy {
+  // L0 Kernel Handle
+  ze_kernel_handle_t zeKernel;
+  // Kernel Properties
+  KernelPropertiesTy Properties;
+  auto &getProperties() { return Properties; }
+
+  int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
+                              KernelLaunchParamsTy LaunchParams,
+                              __tgt_async_info *AsyncInfo) const;
+
+  void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
+                                  uint32_t ThreadLimit,
+                                  TgtNDRangeDescTy *LoopLevels,
+                                  uint32_t *GroupSizes,
+                                  ze_group_count_t &GroupCounts,
+                                  bool HalfNumThreads,
+                                  bool IsTeamsNDRange) const;
+
+  int32_t decideLoopKernelGroupArguments(
+      L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+      uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+      bool &AllowCooperative) const;
+
+  Error buildKernel(L0ProgramTy &Program);
+
+public:
+  /// Create a L0 kernel with a name and an execution mode.
+  L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
+  ~L0KernelTy() {
+    if (zeKernel)
+      CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel);
+  }
+  L0KernelTy(const L0KernelTy &) = delete;
+  L0KernelTy(L0KernelTy &&) = delete;
+  L0KernelTy &operator=(const L0KernelTy &) = delete;
+  L0KernelTy &operator=(L0KernelTy &&) = delete;
+
+  const auto &getProperties() const { return Properties; }
+
+  /// Initialize the L0 kernel.
+  Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
+  /// Launch the L0 kernel function.
+  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
+                   uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
+                   KernelLaunchParamsTy LaunchParams,
+                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+
+  /// Not implemented yet: the body is the L0_UNIMPLEMENTED_ERR return.
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override{
+      L0_UNIMPLEMENTED_ERR}
+
+  ze_kernel_handle_t getZeKernel() const { return zeKernel; }
+
+  int32_t getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+                         int32_t ThreadLimit, uint32_t *GroupSizes,
+                         ze_group_count_t &GroupCounts, void *LoopDesc,
+                         bool &AllowCooperative) const;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
new file mode 100644
index 0000000000000..50af80a19a93a
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -0,0 +1,574 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <level_zero/ze_api.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Map a TARGET_ALLOC_* kind to a human-readable string.
+/// The argument is parenthesized so that compound expressions expand with the
+/// intended precedence. It may be evaluated more than once, so do not pass an
+/// expression with side effects.
+#define ALLOC_KIND_TO_STR(Kind)                                                \
+  ((Kind) == TARGET_ALLOC_HOST                                                 \
+       ? "host memory"                                                         \
+       : ((Kind) == TARGET_ALLOC_SHARED                                        \
+              ? "shared memory"                                                \
+              : ((Kind) == TARGET_ALLOC_DEVICE ? "device memory"               \
+                                               : "unknown memory")))
+
+// forward declarations
+struct L0OptionsTy;
+class L0DeviceTy;
+class L0ContextTy;
+
+struct DynamicMemHeapTy {
+  /// Base address memory is allocated from
+  uintptr_t AllocBase = 0;
+  /// Minimal size served by the current heap
+  size_t BlockSize = 0;
+  /// Max size served by the current heap
+  size_t MaxSize = 0;
+  /// Available memory blocks
+  uint32_t NumBlocks = 0;
+  /// Number of block descriptors
+  uint32_t NumBlockDesc = 0;
+  /// Number of block counters
+  uint32_t NumBlockCounter = 0;
+  /// List of memory block descriptors
+  uint64_t *BlockDesc = nullptr;
+  /// List of memory block counters
+  uint32_t *BlockCounter = nullptr;
+};
+
+struct DynamicMemPoolTy {
+  /// Location of device memory blocks
+  void *PoolBase = nullptr;
+  /// Heap size common to all heaps
+  size_t HeapSize = 0;
+  /// Number of heaps available
+  uint32_t NumHeaps = 0;
+  /// Heap descriptors (using fixed-size array to simplify memory allocation)
+  DynamicMemHeapTy HeapDesc[8];
+};
+
+/// Memory allocation information used in memory allocation/deallocation.
+struct MemAllocInfoTy {
+  /// Base address allocated from compute runtime
+  void *Base = nullptr;
+  /// Allocation size known to users/libomptarget
+  size_t Size = 0;
+  /// TARGET_ALLOC kind
+  int32_t Kind = TARGET_ALLOC_DEFAULT;
+  /// Allocation from pool?
+  bool InPool = false;
+  /// Is implicit argument
+  bool ImplicitArg = false;
+
+  MemAllocInfoTy() = default;
+
+  /// Construct a fully specified record. The parameter names avoid the
+  /// underscore-followed-by-uppercase spelling, which is reserved for the
+  /// implementation.
+  MemAllocInfoTy(void *AllocBase, size_t AllocSize, int32_t AllocKind,
+                 bool IsInPool, bool IsImplicitArg)
+      : Base(AllocBase), Size(AllocSize), Kind(AllocKind), InPool(IsInPool),
+        ImplicitArg(IsImplicitArg) {}
+};
+
+/// Responsible for all activities involving memory allocation/deallocation.
+/// It contains memory pool management, memory allocation bookkeeping.
+class MemAllocatorTy {
+
+  /// Simple memory allocation statistics. Maintains numbers for pool allocation
+  /// and GPU RT allocation.
+  struct MemStatTy {
+    size_t Requested[2] = {0, 0}; // Requested bytes
+    size_t Allocated[2] = {0, 0}; // Allocated bytes
+    size_t Freed[2] = {0, 0};     // Freed bytes
+    size_t InUse[2] = {0, 0};     // Current memory in use
+    size_t PeakUse[2] = {0, 0};   // Peak bytes used
+    size_t NumAllocs[2] = {0, 0}; // Number of allocations
+    MemStatTy() = default;
+  };
+
+  /// Memory pool which enables reuse of already allocated blocks
+  /// -- Pool maintains a list of buckets each of which can allocate fixed-size
+  ///    memory.
+  /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
+  /// -- Each memory block can allocate multiple fixed-size memory requested by
+  ///    offload RT or user.
+  /// -- Memory allocation falls back to GPU RT allocation when the pool size
+  ///    (total memory used by pool) reaches a threshold.
+  class MemPoolTy {
+
+    /// Memory block maintained in each bucket
+    struct BlockTy {
+      /// Base address of this block
+      uintptr_t Base = 0;
+      /// Size of the block
+      size_t Size = 0;
+      /// Supported allocation size by this block
+      size_t ChunkSize = 0;
+      /// Total number of slots
+      uint32_t NumSlots = 0;
+      /// Number of slots in use
+      uint32_t NumUsedSlots = 0;
+      /// Cached available slot returned by the last dealloc() call
+      uint32_t FreeSlot = UINT32_MAX;
+      /// Marker for the currently used slots
+      std::vector<bool> UsedSlots;
+
+      /// Describe the block of \p BlockSize bytes starting at \p BlockBase and
+      /// split it into slots of \p BlockChunkSize bytes each. All slots start
+      /// out unused. \p BlockChunkSize must be non-zero.
+      BlockTy(void *BlockBase, size_t BlockSize, size_t BlockChunkSize)
+          : Base(reinterpret_cast<uintptr_t>(BlockBase)), Size(BlockSize),
+            ChunkSize(BlockChunkSize) {
+        // Guard the division below.
+        assert(ChunkSize > 0 && "Chunk size must be non-zero");
+        NumSlots = Size / ChunkSize;
+        UsedSlots.resize(NumSlots, false);
+      }
+
+      /// Check if the current block is fully used
+      bool isFull() const { return NumUsedSlots == NumSlots; }
+
+      /// Check if the given address belongs to the current block, i.e. lies in
+      /// the half-open range [Base, Base + Size).
+      bool contains(void *Mem) const {
+        auto M = reinterpret_cast<uintptr_t>(Mem);
+        return M >= Base && M < Base + Size;
+      }
+
+      /// Allocate a single chunk from the block
+      void *alloc();
+
+      /// Deallocate the given memory
+      void dealloc(void *Mem);
+    }; // BlockTy
+
+    /// Allocation kind for the current pool
+    int32_t AllocKind = TARGET_ALLOC_DEFAULT;
+    /// Access to the allocator
+    MemAllocatorTy *Allocator = nullptr;
+    /// Minimum supported memory allocation size from pool
+    size_t AllocMin = 1 << 6; // 64B
+    /// Maximum supported memory allocation size from pool
+    size_t AllocMax = 0;
+    /// Allocation size when the pool needs to allocate a block
+    size_t AllocUnit = 1 << 16; // 64KB
+    /// Capacity of each block in the buckets which decides number of
+    /// allocatable chunks from the block. Each block in the bucket can serve
+    /// at least BlockCapacity chunks.
+    /// If ChunkSize * BlockCapacity <= AllocUnit
+    ///   BlockSize = AllocUnit
+    /// Otherwise,
+    ///   BlockSize = ChunkSize * BlockCapacity
+    /// This simply means how much memory is over-allocated.
+    uint32_t BlockCapacity = 0;
+    /// Total memory allocated from GPU RT for this pool
+    size_t PoolSize = 0;
+    /// Maximum allowed pool size. Allocation falls back to GPU RT allocation if
+    /// when PoolSize reaches PoolSizeMax.
+    size_t PoolSizeMax = 0;
+    /// Small allocation size allowed in the pool even if pool size is over the
+    /// pool size limit
+    size_t SmallAllocMax = 1024;
+    /// Small allocation pool size
+    size_t SmallPoolSize = 0;
+    /// Small allocation pool size max (4MB)
+    size_t SmallPoolSizeMax = (4 << 20);
+    /// List of buckets
+    std::vector<std::vector<BlockTy *>> Buckets;
+    /// List of bucket parameters
+    std::vector<std::pair<size_t, size_t>> BucketParams;
+    /// Map from allocated pointer to corresponding block.
+    std::unordered_map<void *, BlockTy *> PtrToBlock;
+    /// Simple stats counting miss/hit in each bucket.
+    std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
+    /// Need to zero-initialize after L0 allocation
+    bool ZeroInit = false;
+    /// Zero-initialized values to be copied to device
+    std::vector<char> ZeroInitValue;
+
+    /// Get bucket ID from the specified allocation size: the number of times
+    /// AllocMin has to be doubled to reach at least \p Size.
+    uint32_t getBucketId(size_t Size) const {
+      uint32_t Count = 0;
+      // Stop once the shifted size wraps around to zero; without this check a
+      // request larger than the biggest representable power of two would
+      // never leave the loop.
+      for (size_t SZ = AllocMin; SZ != 0 && SZ < Size; SZ <<= 1)
+        ++Count;
+      return Count;
+    }
+
+  public:
+    MemPoolTy() = default;
+
+    /// Construct pool with allocation kind, allocator, and user options.
+    MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+              const L0OptionsTy &Option);
+    // Used for reduction pool
+    MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+    // Used for small memory pool with fixed parameters
+    MemPoolTy(MemAllocatorTy *_Allocator);
+
+    MemPoolTy(const MemPoolTy &) = delete;
+    MemPoolTy(MemPoolTy &&) = delete;
+    MemPoolTy &operator=(const MemPoolTy &) = delete;
+    MemPoolTy &operator=(const MemPoolTy &&) = delete;
+
+    void printUsage();
+    /// Release resources used in the pool.
+    ~MemPoolTy();
+
+    /// Allocate the requested size of memory from this pool.
+    /// AllocSize is the chunk size internally used for the returned memory.
+    void *alloc(size_t Size, size_t &AllocSize);
+    /// Deallocate the specified memory and returns block size deallocated.
+    size_t dealloc(void *Ptr);
+  }; // MemPoolTy
+
+  /// Allocation information maintained in the plugin
+  class MemAllocInfoMapTy {
+    /// Map from allocated pointer to allocation information
+    std::map<void *, MemAllocInfoTy> Map;
+    /// Map from target alloc kind to number of implicit arguments
+    std::map<int32_t, uint32_t> NumImplicitArgs;
+
+  public:
+    /// Add allocation information to the map
+    void add(void *Ptr, void *Base, size_t Size, int32_t Kind,
+             bool InPool = false, bool ImplicitArg = false);
+
+    /// Remove allocation information for the given memory location
+    bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
+
+    /// Finds allocation information for the given memory location.
+    /// Returns nullptr when \p Ptr is not a key of the map.
+    const MemAllocInfoTy *find(void *Ptr) const {
+      auto It = Map.find(Ptr);
+      return It == Map.end() ? nullptr : &It->second;
+    }
+
+    /// Check if the range [Ptr, Ptr + Size) lies entirely inside one recorded
+    /// allocation.
+    bool contains(const void *Ptr, size_t Size) const {
+      // Find the last allocation that starts at or before Ptr. An empty map
+      // yields begin() here, so no separate emptiness check is needed.
+      auto I = Map.upper_bound(const_cast<void *>(Ptr));
+      if (I == Map.begin())
+        return false;
+      --I;
+      const auto Start = reinterpret_cast<uintptr_t>(I->first);
+      const auto Addr = reinterpret_cast<uintptr_t>(Ptr);
+      // Compare offsets rather than end addresses so that a huge Size cannot
+      // wrap around and appear to fit.
+      const size_t Off = Addr - Start;
+      const size_t AllocSize = I->second.Size;
+      return Off <= AllocSize && Size <= AllocSize - Off;
+    }
+
+    /// Returns the number of implicit arguments for the specified allocation
+    /// kind, or 0 when none were recorded. This is a pure lookup: it does not
+    /// insert an entry for an unknown kind.
+    size_t getNumImplicitArgs(int32_t Kind) const {
+      auto It = NumImplicitArgs.find(Kind);
+      return It == NumImplicitArgs.end() ? 0 : It->second;
+    }
+  }; // MemAllocInfoMapTy
+
+  /// L0 context to use
+  const L0ContextTy *L0Context = nullptr;
+  /// L0 device to use
+  L0DeviceTy *Device = nullptr;
+  /// Whether the device supports large memory allocation
+  bool SupportsLargeMem = false;
+  /// Cached max alloc size supported by device
+  uint64_t MaxAllocSize = INT64_MAX;
+  /// Map from allocation kind to memory statistics
+  std::unordered_map<int32_t, MemStatTy> Stats;
+  /// Map from allocation kind to memory pool
+  std::unordered_map<int32_t, MemPoolTy> Pools;
+  /// Memory pool dedicated to reduction scratch space
+  std::unique_ptr<MemPoolTy> ReductionPool;
+  /// Memory pool dedicated to reduction counters
+  std::unique_ptr<MemPoolTy> CounterPool;
+  /// Allocation information map
+  MemAllocInfoMapTy AllocInfo;
+  /// RTL-owned memory that needs to be freed automatically
+  std::list<void *> MemOwned;
+  /// Lock protection
+  std::mutex Mtx;
+  /// Allocator only supports host memory
+  bool IsHostMem = false;
+  // Internal deallocation function to be called when already
+  // holding the Mtx lock
+  int32_t dealloc_locked(void *Ptr);
+
+public:
+  MemAllocatorTy() = default;
+
+  MemAllocatorTy(const MemAllocatorTy &) = delete;
+  MemAllocatorTy(MemAllocatorTy &&) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+  /// Release resources and report statistics if requested
+  ~MemAllocatorTy() {
+    if (L0Context)
+      deinit(); // Release resources
+  }
+  void deinit();
+
+  /// Allocator only supports host memory
+  bool supportsHostMem() { return IsHostMem; }
+
+  void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+  void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+  void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+  /// Allocate memory from L0 GPU RT. We use over-allocation workaround
+  /// to support target pointer with offset, and positive "ActiveSize" is
+  /// specified in such cases for correct debug logging.
+  void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
+
+  /// Allocate memory with the specified information from a memory pool
+  void *alloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+              bool UserAlloc, bool DevMalloc, uint32_t MemAdvice,
+              AllocOptionTy AllocOpt);
+
+  /// Deallocate memory
+  int32_t dealloc(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return dealloc_locked(Ptr);
+  }
+
+  /// Check if the given memory location and offset belongs to any allocated
+  /// memory
+  bool contains(const void *Ptr, size_t Size) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return AllocInfo.contains(Ptr, Size);
+  }
+
+  /// Get allocation information for the specified memory location.
+  /// Returns nullptr when \p Ptr is not a key of the allocation map.
+  /// NOTE(review): the lock is released when this function returns, while the
+  /// returned pointer still refers to an entry stored in the map. Presumably
+  /// callers must not use it concurrently with a deallocation of the same
+  /// pointer -- confirm.
+  const MemAllocInfoTy *getAllocInfo(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return AllocInfo.find(Ptr);
+  }
+
+  /// Get kernel indirect access flags using implicit argument info
+  ze_kernel_indirect_access_flags_t getIndirectFlags() {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    ze_kernel_indirect_access_flags_t Ret = 0;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+    return Ret;
+  }
+
+  /// Log memory allocation/deallocation.
+  /// A positive \p ReqSize records an allocation of \p Size bytes; a zero
+  /// \p ReqSize records a deallocation of \p Size bytes. \p Pool selects the
+  /// second counter slot (index 1) instead of the first (index 0). Nothing is
+  /// recorded when no statistics entry exists for \p Kind.
+  void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
+    auto StatIt = Stats.find(Kind);
+    if (StatIt == Stats.end())
+      return; // Stat is disabled
+
+    MemStatTy &Stat = StatIt->second;
+    const int32_t Slot = Pool ? 1 : 0;
+    const bool IsDealloc = !(ReqSize > 0);
+    if (IsDealloc) {
+      Stat.Freed[Slot] += Size;
+      Stat.InUse[Slot] -= Size;
+    } else {
+      Stat.Requested[Slot] += ReqSize;
+      Stat.Allocated[Slot] += Size;
+      Stat.InUse[Slot] += Size;
+      Stat.NumAllocs[Slot]++;
+    }
+    Stat.PeakUse[Slot] = (std::max)(Stat.PeakUse[Slot], Stat.InUse[Slot]);
+  }
+
+  /// Perform copy operation
+  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+}; /// MemAllocatorTy
+
+// Simple generic wrapper to reuse objects.
+// Objects must have an accessible zero-argument constructor. Objects handed
+// back through release() are owned by the pool and deleted in its destructor.
+template <class ObjTy> class ObjPool {
+  // Protects Objects
+  std::mutex Mtx;
+  // Objects currently available for reuse
+  std::list<ObjTy *> Objects;
+
+public:
+  ObjPool() = default;
+
+  ObjPool(const ObjPool &) = delete;
+  ObjPool(ObjPool &&) = delete;
+  ObjPool &operator=(const ObjPool &) = delete;
+  ObjPool &operator=(ObjPool &&) = delete;
+
+  /// Return a pooled object if one is available, otherwise a newly
+  /// constructed one. The caller gets ownership until it calls release().
+  ObjTy *get() {
+    {
+      // The list must only be inspected under the lock: an unlocked empty()
+      // check would race with a concurrent release().
+      std::lock_guard<std::mutex> Lock(Mtx);
+      if (!Objects.empty()) {
+        ObjTy *Ret = Objects.back();
+        Objects.pop_back();
+        return Ret;
+      }
+    }
+    // Construct outside the critical section.
+    return new ObjTy();
+  }
+
+  /// Hand \p obj back to the pool for later reuse.
+  void release(ObjTy *obj) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    Objects.push_back(obj);
+  }
+
+  ~ObjPool() {
+    for (auto object : Objects)
+      delete object;
+  }
+};
+
+/// Common event pool used in the plugin. This event pool assumes all events
+/// from the pool are host-visible and use the same event pool flag.
+class EventPoolTy {
+  /// Size of L0 event pool created on demand
+  size_t PoolSize = 64;
+
+  /// Context of the events
+  ze_context_handle_t Context = nullptr;
+
+  /// Additional event pool flags common to this pool
+  uint32_t Flags = 0;
+
+  /// Protection
+  std::unique_ptr<std::mutex> Mtx;
+
+  /// List of created L0 event pools
+  std::list<ze_event_pool_handle_t> Pools;
+
+  /// List of free L0 events
+  std::list<ze_event_handle_t> Events;
+
+#ifdef OMPT_SUPPORT
+  /// Event to OMPT record map. The timestamp information is recorded to the
+  /// OMPT record before the event is recycled.
+  std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
+#endif // OMPT_SUPPORT
+
+public:
+  /// Initialize context, flags, and mutex
+  void init(ze_context_handle_t EventContext, uint32_t EventFlags) {
+    Context = EventContext;
+    Flags = EventFlags;
+    Mtx = std::make_unique<std::mutex>();
+  }
+
+  /// Destroys L0 resources. Safe to call more than once: the member lists are
+  /// emptied before any handle is destroyed, so a handle is never destroyed
+  /// twice.
+  void deinit() {
+    // Detach the handles first. CALL_ZE_RET_VOID presumably leaves this
+    // function early on failure (its definition is not visible here); in that
+    // case the members must not keep already-destroyed handles.
+    std::list<ze_event_handle_t> StaleEvents;
+    StaleEvents.swap(Events);
+    std::list<ze_event_pool_handle_t> StalePools;
+    StalePools.swap(Pools);
+
+    for (auto E : StaleEvents)
+      CALL_ZE_RET_VOID(zeEventDestroy, E);
+    for (auto P : StalePools)
+      CALL_ZE_RET_VOID(zeEventPoolDestroy, P);
+  }
+
+  /// Get a free event from the pool
+  ze_event_handle_t getEvent();
+
+  /// Return an event to the pool
+  void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
+};
+
+/// Staging buffer
+/// A single staging buffer is not enough when batching is enabled since there
+/// can be multiple pending copy operations.
+///
+/// Host memory is allocated in groups of Count slots of Size bytes each.
+/// Offset is the byte position of the next slot counted across all groups, so
+/// group number Offset / (Size * Count) holds that slot.
+class StagingBufferTy {
+  /// Context for L0 calls
+  ze_context_handle_t Context = nullptr;
+  /// Max allowed size for staging buffer
+  size_t Size = L0StagingBufferSize;
+  /// Number of buffers allocated together
+  size_t Count = L0StagingBufferCount;
+  /// Buffers increasing by Count if a new buffer is required
+  std::list<void *> Buffers;
+  /// Next buffer location in the buffers
+  size_t Offset = 0;
+
+  /// Allocate one more group of Count slots and append it to Buffers.
+  /// Returns the base address of the new group.
+  void *addBuffers() {
+    ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+                                       nullptr, 0};
+    void *Ret = nullptr;
+    size_t AllocSize = Size * Count;
+    CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
+                     L0Alignment, &Ret);
+    Buffers.push_back(Ret);
+    return Ret;
+  }
+
+public:
+  StagingBufferTy() = default;
+  StagingBufferTy(const StagingBufferTy &) = delete;
+  StagingBufferTy(StagingBufferTy &&) = delete;
+  StagingBufferTy &operator=(const StagingBufferTy &) = delete;
+  StagingBufferTy &operator=(StagingBufferTy &&) = delete;
+
+  ~StagingBufferTy() {
+    if (initialized())
+      clear();
+  }
+
+  /// Free every allocated group and return to the uninitialized state.
+  void clear() {
+    ze_result_t Rc;
+    (void)Rc; // GCC build compiler thinks Rc is unused for some reason.
+    for (auto Ptr : Buffers)
+      CALL_ZE(Rc, zeMemFree, Context, Ptr);
+    // Forget the freed pointers and the position inside them; otherwise a
+    // later init() followed by get()/getNext() would hand out freed memory.
+    Buffers.clear();
+    Offset = 0;
+    Context = nullptr;
+  }
+
+  bool initialized() const { return Context != nullptr; }
+
+  /// Set the context, the slot size, and the number of slots per group.
+  void init(ze_context_handle_t NewContext, size_t NewSize, size_t NewCount) {
+    Context = NewContext;
+    Size = NewSize;
+    Count = NewCount;
+  }
+
+  /// Restart handing out slots from the first one; nothing is freed.
+  void reset() { Offset = 0; }
+
+  /// Always return the first buffer
+  void *get() {
+    if (Size == 0 || Count == 0)
+      return nullptr;
+    return Buffers.empty() ? addBuffers() : Buffers.front();
+  }
+
+  /// Return the next available buffer, or nullptr when staging is disabled
+  /// (Size or Count is zero) or no new group could be obtained.
+  void *getNext() {
+    if (Size == 0 || Count == 0)
+      return nullptr;
+
+    const size_t AllocSize = Size * Count;
+    // Group that holds the slot at Offset. Using the last group
+    // unconditionally would hand out the same slot twice after reset() once
+    // more than one group exists.
+    const size_t GroupIdx = Offset / AllocSize;
+    void *Ret = nullptr;
+    if (GroupIdx >= Buffers.size()) {
+      Ret = addBuffers();
+    } else {
+      auto GroupIt = Buffers.begin();
+      for (size_t I = 0; I < GroupIdx; ++I)
+        ++GroupIt;
+      Ret = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(*GroupIt) +
+                                     (Offset % AllocSize));
+    }
+
+    if (!Ret)
+      return nullptr;
+
+    Offset += Size;
+    return Ret;
+  }
+
+  /// Return either a fixed buffer or next buffer
+  void *get(bool Next) { return Next ? getNext() : get(); }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
new file mode 100644
index 0000000000000..b3ecd25f56ddd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -0,0 +1,189 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <level_zero/ze_api.h>
+
+#include "Shared/EnvironmentVar.h"
+
+#include "L0Defs.h"
+
+namespace llvm::omp::target::plugin {
+/// Command submission mode
+enum class CommandModeTy { Sync = 0, Async, AsyncOrdered };
+
+/// Specialization constants used for a module compilation.
+class SpecConstantsTy {
+  std::vector<uint32_t> ConstantIds;
+  /// Copies of the constant values, each allocated with new char[] in
+  /// addConstant() and released in the destructor.
+  std::vector<const void *> ConstantValues;
+
+public:
+  SpecConstantsTy() = default;
+  SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(SpecConstantsTy &&) = delete;
+
+  /// Take over the constants of \p Other. The source is left empty, so only
+  /// one object ever releases each value buffer. Moving from a const object
+  /// is not offered: it can only copy the pointers, which makes both objects
+  /// free the same storage.
+  SpecConstantsTy(SpecConstantsTy &&Other) noexcept
+      : ConstantIds(std::move(Other.ConstantIds)),
+        ConstantValues(std::move(Other.ConstantValues)) {
+    Other.ConstantIds.clear();
+    Other.ConstantValues.clear();
+  }
+
+  ~SpecConstantsTy() {
+    for (auto I : ConstantValues) {
+      const char *ValuePtr = reinterpret_cast<const char *>(I);
+      delete[] ValuePtr;
+    }
+  }
+
+  /// Record constant \p Id with value \p Val; a private copy of the value is
+  /// stored.
+  template <typename T> void addConstant(uint32_t Id, T Val) {
+    const size_t ValSize = sizeof(Val);
+    char *ValuePtr = new char[ValSize];
+    *reinterpret_cast<T *>(ValuePtr) = Val;
+
+    ConstantIds.push_back(Id);
+    ConstantValues.push_back(ValuePtr);
+  }
+
+  /// Build the L0 descriptor. It points into this object's vectors, so it is
+  /// only usable while this object is alive and unchanged.
+  ze_module_constants_t getModuleConstants() const {
+    ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
+                              ConstantIds.data(),
+                              // Unfortunately we have to const_cast it.
+                              // L0 data type should probably be fixed.
+                              const_cast<const void **>(ConstantValues.data())};
+    return Tmp;
+  }
+};
+#define FIXED static constexpr
+
+/// L0 Plugin flags
+struct L0OptionFlagsTy {
+  uint64_t UseMemoryPool : 1;
+  uint64_t Reserved : 63;
+  L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {}
+};
+
+struct L0OptionsTy {
+  /// Binary flags
+  L0OptionFlagsTy Flags;
+
+  /// Staging buffer size
+  size_t StagingBufferSize = L0StagingBufferSize;
+
+  /// Staging buffer count
+  size_t StagingBufferCount = L0StagingBufferCount;
+
+  // TODO: This should probably be an array indexed by AllocKind
+  /// Memory pool parameters
+  /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
+  std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
+      {TARGET_ALLOC_DEVICE, {1, 4, 256}},
+      {TARGET_ALLOC_HOST, {1, 4, 256}},
+      {TARGET_ALLOC_SHARED, {8, 4, 256}}};
+
+  /// Parameters for memory pools dedicated to reduction scratch space
+  std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
+
+  /// Oversubscription rate for normal kernels
+  FIXED uint32_t SubscriptionRate = 4;
+
+  /// Loop kernels with known ND-range may be known to have
+  /// few iterations and they may not exploit the offload device
+  /// to the fullest extent.
+  /// Let's assume a device has N total HW threads available,
+  /// and the kernel requires M hardware threads with LWS set to L.
+  /// If (M < N * ThinThreadsThreshold), then we will try
+  /// to iteratively divide L by 2 to increase the number of HW
+  /// threads used for executing the kernel. Effectively, we will
+  /// end up with L less than the kernel's SIMD width, so the HW
+  /// threads will not use all their SIMD lanes. This (presumably) should
+  /// allow more parallelism, because the stalls in the SIMD lanes
+  /// will be distributed across more HW threads, and the probability
+  /// of having a stall (or a sequence of stalls) on a critical path
+  /// in the kernel should decrease.
+  /// Anyway, this is just a heuristics that seems to work well for some
+  /// kernels (which poorly expose parallelism in the first place).
+  FIXED double ThinThreadsThreshold = 0.1;
+
+  /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
+  /// All the discard filter should be before the accept filter.
+  std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
+
+  /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
+  bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
+
+  // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+  // builtins.
+  std::string CompilationOptions = "-cl-std=CL2.0 ";
+  std::string InternalCompilationOptions = "-cl-take-global-address";
+  std::string UserCompilationOptions = "";
+
+  // Spec constants used for all modules.
+  SpecConstantsTy CommonSpecConstants;
+
+  /// Command execution mode.
+  /// Whether the runtime uses asynchronous mode or not depends on the type of
+  /// devices and whether immediate command list is fully enabled.
+  CommandModeTy CommandMode = CommandModeTy::Async;
+
+  bool Init = false; // have the options already been processed
+
+  /// Default construction only applies the in-class member initializers. It
+  /// does not read any environment variable; that happens in init().
+  L0OptionsTy() {}
+
+  /// Process the environment variables (defined out of line).
+  void processEnvironmentVars();
+
+  /// Process the environment variables once; later calls do nothing because
+  /// the Init flag is already set.
+  /// NOTE(review): the flag is a plain bool checked without a lock, so
+  /// presumably callers serialize the first call -- confirm.
+  void init() {
+    if (!Init) {
+      processEnvironmentVars();
+      Init = true;
+    }
+  }
+
+  /// Parse the string and split it into tokens of string_views based on the
+  /// Delim character.
+  std::vector<std::string_view> tokenize(const std::string_view &Filter,
+                                         const std::string &Delim,
+                                         bool ProhibitEmptyTokens = false);
+
+  /// Return true iff \p str is non-empty and every character is a decimal
+  /// digit.
+  bool isDigits(const std::string_view &str) {
+    if (str.empty())
+      return false;
+    // The <cctype> classifiers need a value representable as unsigned char;
+    // passing a negative plain char is undefined behavior.
+    return std::all_of(str.begin(), str.end(), [](char C) {
+      return ::isdigit(static_cast<unsigned char>(C)) != 0;
+    });
+  }
+
+  /// Case-insensitive comparison: true iff \p Var and \p Matched have the same
+  /// length and are equal character by character once lowercased.
+  bool match(const std::string &Var, const std::string &Matched) {
+    if (Var.size() != Matched.size())
+      return false;
+
+    // std::tolower needs a value representable as unsigned char; passing a
+    // negative plain char is undefined behavior.
+    auto equals = [](char a, char b) {
+      return std::tolower(static_cast<unsigned char>(a)) ==
+             std::tolower(static_cast<unsigned char>(b));
+    };
+    return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
+                      equals);
+  }
+
+  bool match(const std::string &Var, const char *Matched) {
+    std::string Str(Matched);
+    return match(Var, Str);
+  }
+
+  bool match(const StringEnvar &Var, const char *Matched) {
+    return match(Var.get(), Matched);
+  }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..4658c1cdab1df
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,136 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Plugin interface for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Defs.h"
+#include "L0Device.h"
+#include "L0Memory.h"
+#include "L0Options.h"
+#include "L0Program.h"
+#include "TLS.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Class implementing the LevelZero specific functionalities of the plugin.
+class LevelZeroPluginTy final : public GenericPluginTy {
+private:
+  /// Number of devices available including subdevices
+  uint32_t NumDevices = 0;
+
+  /// Context (and Driver) specific data
+  std::list<L0ContextTy> ContextList;
+
+  /// L0 device used by each OpenMP device
+  using DeviceContainerTy = llvm::SmallVector<L0DeviceTy *>;
+  DeviceContainerTy L0Devices;
+
+  // Table containing per-thread information using TLS
+  L0ThreadTblTy ThreadTLSTable;
+  // Table containing per-thread information for each device using TLS
+  L0DeviceTLSTableTy DeviceTLSTable;
+  // Table containing per-thread information for each Context using TLS
+  L0ContextTLSTableTy ContextTLSTable;
+
+  /// L0 plugin global options
+  static L0OptionsTy Options;
+
+  /// Global mutex
+  std::mutex GlobalMutex;
+
+  /// Common pool of AsyncQueue
+  AsyncQueuePoolTy AsyncQueuePool;
+
+  auto &getTLS() { return ThreadTLSTable.get(); }
+
+public:
+  LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
+  virtual ~LevelZeroPluginTy() {}
+
+  auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); }
+  auto &getContextTLS(ze_context_handle_t Context) {
+    return ContextTLSTable.get(Context);
+  }
+
+  static const auto &getOptions() { return Options; }
+
+  auto &getGlobalMutex() { return GlobalMutex; }
+
+  struct DevicesRangeTy {
+    using iterator = DeviceContainerTy::iterator;
+
+    iterator BeginIt;
+    iterator EndIt;
+
+    DevicesRangeTy(iterator BeginIt, iterator EndIt)
+        : BeginIt(BeginIt), EndIt(EndIt) {}
+
+    auto &begin() { return BeginIt; }
+    auto &end() { return EndIt; }
+  };
+
+  auto getDevicesRange() {
+    return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
+  }
+
+  /// Clean-up routine to be invoked by the destructor or
+  /// LevelZeroPluginTy::deinit.
+  void closeRTL();
+
+  /// Find L0 devices and initialize device properties.
+  /// Returns number of devices reported to omptarget.
+  int32_t findDevices();
+
+  /// Look up the L0 device stored at index \p DeviceId. The index is only
+  /// range-checked by an assertion.
+  L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
+    const auto DeviceCount = static_cast<int32_t>(L0Devices.size());
+    (void)DeviceCount; // Only read by the assertion below.
+    assert("Invalid device ID" && DeviceId >= 0 && DeviceId < DeviceCount);
+    return *L0Devices[DeviceId];
+  }
+
+  /// Returns NumDevices.
+  /// NOTE(review): the member is documented as the number of devices
+  /// "including subdevices", which does not obviously match "root devices" in
+  /// this accessor's name -- confirm which meaning callers rely on.
+  uint32_t getNumRootDevices() const { return NumDevices; }
+
+  /// Obtain an async queue: the one offered by the calling thread's TLS entry
+  /// when that is non-null, otherwise one taken from the shared pool.
+  AsyncQueueTy *getAsyncQueue() {
+    if (auto *ThreadQueue = getTLS().getAsyncQueue())
+      return ThreadQueue;
+    return AsyncQueuePool.get();
+  }
+
+  /// Give \p Queue back after use; a null queue is ignored. The queue is
+  /// reset and marked as not in use first. It is then offered to the calling
+  /// thread's TLS entry, and handed to the shared pool only when that call
+  /// returns false.
+  void releaseAsyncQueue(AsyncQueueTy *Queue) {
+    if (!Queue)
+      return;
+    Queue->reset();
+    Queue->InUse = false;
+    if (!getTLS().releaseAsyncQueue(Queue))
+      AsyncQueuePool.release(Queue);
+  }
+
+  // Plugin interface
+
+  Expected<int32_t> initImpl() override;
+  Error deinitImpl() override;
+  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+                                int32_t NumDevices) override;
+  GenericGlobalHandlerTy *createGlobalHandler() override;
+  uint16_t getMagicElfBits() const override;
+  Triple::ArchType getTripleArch() const override;
+  const char *getName() const override;
+  Expected<bool> isELFCompatible(uint32_t DeviceId,
+                                 StringRef Image) const override;
+
+  Error flushQueueImpl(omp_interop_val_t *Interop) override;
+  Error syncBarrierImpl(omp_interop_val_t *Interop) override;
+  Error asyncBarrierImpl(omp_interop_val_t *Interop) override;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
new file mode 100644
index 0000000000000..a548b486f4642
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -0,0 +1,135 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Kernel.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Program data to be initialized by plugin
+/// Every field has a default value, so a default-constructed object is fully
+/// initialized.
+/// NOTE(review): presumably mirrored by a structure on the device side, in
+/// which case field order and types must stay as they are -- confirm before
+/// editing.
+struct ProgramDataTy {
+  int Initialized = 0;
+  int NumDevices = 0;
+  // Defaults to -1 rather than 0.
+  int DeviceNum = -1;
+  uint32_t TotalEUs = 0;
+  uint32_t HWThreadsPerEU = 0;
+  // NOTE(review): LB/UB look like the lower/upper bound of the dynamic memory
+  // range -- confirm against the code that fills them in.
+  uintptr_t DynamicMemoryLB = 0;
+  uintptr_t DynamicMemoryUB = 0;
+  int DeviceType = 0;
+  void *DynamicMemPool = nullptr;
+  int TeamsThreadLimit = 0;
+};
+
+/// Level Zero program that can contain multiple modules.
+/// Instances are neither copyable nor movable.
+class L0ProgramTy : public DeviceImageTy {
+  /// Handle multiple modules within a single target image
+  llvm::SmallVector<ze_module_handle_t> Modules;
+
+  /// Map of kernel names to Modules
+  std::unordered_map<std::string, ze_module_handle_t> KernelsToModuleMap;
+
+  /// List of kernels built for this image
+  /// We need to delete them ourselves as the main library is not doing
+  /// that right now
+  std::list<L0KernelTy *> Kernels;
+
+  /// Module that contains global data including device RTL
+  ze_module_handle_t GlobalModule = nullptr;
+
+  /// Requires module link
+  bool RequiresModuleLink = false;
+
+  /// Is this module library
+  bool IsLibModule = false;
+
+  /// Build a single module with the given image, build option, and format.
+  int32_t addModule(const size_t Size, const uint8_t *Image,
+                    const std::string &BuildOption, ze_module_format_t Format);
+  /// Read file and return the size of the binary if successful.
+  size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
+  int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
+  void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+                                        std::string &Options) const;
+
+  /// Check if the image should be handled as a library module
+  void setLibModule();
+
+  L0DeviceTy &getL0Device() const;
+
+public:
+  L0ProgramTy() = delete;
+
+  /// Forward \p ImageId, \p Device and \p Image to the DeviceImageTy base.
+  L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
+              const __tgt_device_image *Image)
+      : DeviceImageTy(ImageId, Device, Image) {}
+
+  ~L0ProgramTy();
+
+  L0ProgramTy(const L0ProgramTy &) = delete;
+  L0ProgramTy(L0ProgramTy &&) = delete;
+  L0ProgramTy &operator=(const L0ProgramTy &) = delete;
+  L0ProgramTy &operator=(L0ProgramTy &&) = delete;
+
+  /// Downcast a generic device image to an L0ProgramTy. The cast is
+  /// unchecked: \p Device must actually refer to an L0ProgramTy.
+  static L0ProgramTy &makeL0Program(DeviceImageTy &Device) {
+    return static_cast<L0ProgramTy &>(Device);
+  }
+
+  /// Build modules from the target image description
+  int32_t buildModules(std::string &BuildOptions);
+
+  /// Link modules stored in \p Modules.
+  int32_t linkModules();
+
+  /// Loads the kernels names from all modules
+  int32_t loadModuleKernels();
+
+  /// Read data from the location in the device image which corresponds to the
+  /// specified global variable name.
+  int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
+
+  /// Write data to the location in the device image which corresponds to the
+  /// specified global variable name.
+  int32_t writeGlobalVariable(const char *Name, size_t Size,
+                              const void *HostPtr);
+
+  /// Looks up an OpenMP declare target global variable with the given
+  /// \p Name in the device environment for the current device.
+  /// The lookup is first done via the device offload table. If it fails,
+  /// then the lookup falls back to non-OpenMP specific lookup on the device.
+  void *getOffloadVarDeviceAddr(const char *Name) const;
+
+  /// Returns the handle of a module that contains a given Kernel name, or
+  /// nullptr when \p KernelName is null or not present in KernelsToModuleMap.
+  ze_module_handle_t findModuleFromKernelName(const char *KernelName) const {
+    // Constructing a std::string from a null pointer is undefined behavior.
+    if (!KernelName)
+      return nullptr;
+
+    auto K = KernelsToModuleMap.find(std::string(KernelName));
+    if (K == KernelsToModuleMap.end())
+      return nullptr;
+
+    return K->second;
+  }
+
+  /// Append \p Kernel to the list of kernels built for this image.
+  void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); }
+};
+
+/// Level Zero implementation of GenericGlobalHandlerTy. Only
+/// getGlobalMetadataFromDevice is overridden here.
+struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy {
+  /// NOTE(review): presumably fills \p DeviceGlobal with the metadata of the
+  /// global found in \p Image on \p Device -- confirm against the definition.
+  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                    DeviceImageTy &Image,
+                                    GlobalTy &DeviceGlobal) override;
+};
+
+/// Two overloads taking the image either as a __tgt_device_image pointer or
+/// as a StringRef.
+/// NOTE(review): presumably they return true for a valid image and report its
+/// version through \p MajorVer and \p MinorVer -- confirm against the
+/// definitions.
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer);
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer);
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
new file mode 100644
index 0000000000000..2eeae81016dee
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -0,0 +1,193 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing L0
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+#pragma once
+
+#include "Shared/Debug.h"
+#include "omptarget.h"
+#include <string>
+#include <level_zero/ze_api.h>
+
+/// Stringify \p x as written, without macro-expanding it first.
+#define STR(x) #x
+/// Stringify \p x after macro expansion.
+#define TO_STRING(x) STR(x)
+
+/// Forward the arguments to DP, but only when getDebugLevel() is above 1.
+#define DPCALL(...)                                                            \
+  do {                                                                         \
+    if (getDebugLevel() > 1)                                                   \
+      DP(__VA_ARGS__);                                                         \
+  } while (0)
+
+/// Print the name of the enclosing function and \p Msg to stderr, then
+/// terminate the process with EXIT_FAILURE.
+#define FATAL_ERROR(Msg)                                                       \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg);   \
+    exit(EXIT_FAILURE);                                                        \
+  } while (0)
+
+/// Print a "Warning: "-prefixed message to stderr. The first argument must be
+/// a string literal because it is concatenated with the prefix.
+#define WARNING(...)                                                           \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Warning: " __VA_ARGS__);                                  \
+  } while (0)
+
+/// Warn that option \p Name (stringified) with the C string \p Value is
+/// ignored.
+#define INVALID_OPTION(Name, Value)                                            \
+  WARNING("Ignoring invalid option " #Name "=%s\n", Value)
+
+/// Call \p Fn with the remaining arguments and store its result in \p Rc.
+#define CALL_ZE(Rc, Fn, ...)                                                   \
+  do {                                                                         \
+      Rc = Fn(__VA_ARGS__);                                                    \
+  } while (0)
+
+/// Like CALL_ZE, but additionally logs through DP when the result is not
+/// ZE_RESULT_SUCCESS. It never returns from the enclosing function; the
+/// caller inspects \p Rc.
+#define CALL_ZE_RC(Rc, Fn, ...)                                                \
+  do {                                                                         \
+    CALL_ZE(Rc, Fn, __VA_ARGS__);                                              \
+    if (Rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc,    \
+         getZeErrorName(Rc));                                                  \
+    }                                                                          \
+  } while(0)
+
+/// For non-thread-safe functions
+/// Calls \p Fn while holding \p Mtx (locked before the call and unlocked
+/// right after it). When the result is not ZE_RESULT_SUCCESS, the failure is
+/// logged through DP and \p Ret is returned from the enclosing function.
+#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...)                                     \
+  do {                                                                         \
+    Mtx.lock();                                                                \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    Mtx.unlock();                                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      return Ret;                                                              \
+    }                                                                          \
+  } while (0)
+
+/// Convenience forms of CALL_ZE_RET_MTX returning OFFLOAD_FAIL, NULL or 0.
+#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
+
+/// For thread-safe functions
+/// Calls \p Fn; when the result is not ZE_RESULT_SUCCESS, the failure is
+/// logged through DP and \p Ret is returned from the enclosing function.
+/// \p Ret is evaluated inside the scope of the local `rc`, so it may refer
+/// to it.
+#define CALL_ZE_RET(Ret, Fn, ...)                                              \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      return Ret;                                                              \
+    }                                                                          \
+  } while (0)
+
+/// Convenience forms of CALL_ZE_RET returning OFFLOAD_FAIL, NULL, 0 or
+/// nothing (for functions returning void).
+#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__)
+/// Form of CALL_ZE_RET that returns a Plugin::error built from the name of
+/// \p Fn and the `rc` local declared by CALL_ZE_RET.
+#define CALL_ZE_RET_ERROR(Fn, ...)                                             \
+  CALL_ZE_RET(                                                                 \
+    Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s",           \
+    STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
+
+
+
+/// Call \p Fn with the remaining arguments. On failure, log the error code
+/// through DP and print the driver's last error description for \p Dev to
+/// stderr. When no description can be obtained, the Level Zero name of the
+/// failing result code is printed instead.
+/// NOTE(review): despite its name this macro does not return from the
+/// enclosing function -- confirm that this is what callers expect.
+#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...)                                     \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      const char *err_str = nullptr;                                           \
+      const ze_result_t desc_rc = zeDriverGetLastErrorDescription(             \
+          Dev.getDriverHandle(), &err_str);                                    \
+      /* Never pass a null pointer to the %s conversion below. */              \
+      if (desc_rc != ZE_RESULT_SUCCESS || !err_str)                            \
+        err_str = getZeErrorName(rc);                                          \
+      fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn,          \
+              err_str);                                                        \
+    }                                                                          \
+  } while (0)
+
+/// Call \p Fn; when the result is not ZE_RESULT_SUCCESS, log the failure
+/// through DP and terminate the process with EXIT_FAILURE.
+#define CALL_ZE_EXIT_FAIL(Fn, ...)                                             \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      std::exit(EXIT_FAILURE);                                                 \
+    }                                                                          \
+  } while (0)
+
+/// Invoke the extension entry point \p Name for \p Device through
+/// CALL_ZE_EXT_SILENT and return \p Ret from the enclosing function when the
+/// result is not ZE_RESULT_SUCCESS. Nothing is logged here. \p Ret is
+/// evaluated inside the scope of the local `rc`, so it may refer to it.
+/// NOTE(review): CALL_ZE_EXT_SILENT is not defined in this header; it has to
+/// be visible wherever this macro is expanded.
+#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...)                         \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__);                         \
+    if (rc != ZE_RESULT_SUCCESS)                                               \
+      return Ret;                                                              \
+  } while (0)
+
+/// Form of CALL_ZE_EXT_SILENT_RET that returns a Plugin::error built from the
+/// name of the extension and the failing result code.
+#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...)                               \
+  CALL_ZE_EXT_SILENT_RET(Device,                                               \
+      Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s",          \
+                    STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__)
+
+/// X-macro: applies \p Fn to each Level Zero result code listed below.
+/// getZeErrorName() only knows the names of the codes in this list.
+#define FOREACH_ZE_ERROR_CODE(Fn)                                              \
+  Fn(ZE_RESULT_SUCCESS)                                                        \
+  Fn(ZE_RESULT_NOT_READY)                                                      \
+  Fn(ZE_RESULT_ERROR_DEVICE_LOST)                                              \
+  Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY)                                       \
+  Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)                                     \
+  Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE)                                     \
+  Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE)                                      \
+  Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET)                                    \
+  Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE)                                \
+  Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS)                                 \
+  Fn(ZE_RESULT_ERROR_NOT_AVAILABLE)                                            \
+  Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)                                   \
+  Fn(ZE_RESULT_WARNING_DROPPED_DATA)                                           \
+  Fn(ZE_RESULT_ERROR_UNINITIALIZED)                                            \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION)                                      \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT)                                         \
+  Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE)                                      \
+  Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE)                                     \
+  Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER)                                     \
+  Fn(ZE_RESULT_ERROR_INVALID_SIZE)                                             \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE)                                         \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION)                                      \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION)                                  \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT)                                 \
+  Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION)                             \
+  Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX)                            \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE)                             \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED)                                  \
+  Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE)                                \
+  Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS)                                      \
+  Fn(ZE_RESULT_WARNING_ACTION_REQUIRED)                                        \
+  Fn(ZE_RESULT_ERROR_UNKNOWN)
+
+/// Expands to a case label for \p Num that returns the enumerator's spelling.
+#define CASE_TO_STRING(Num)                                                    \
+  case Num:                                                                    \
+    return #Num;
+/// Translate a Level Zero result code into the name of its enumerator.
+/// Codes that are not part of FOREACH_ZE_ERROR_CODE are reported as
+/// "ZE_RESULT_ERROR_UNKNOWN".
+inline const char *getZeErrorName(int32_t Error) {
+  switch (Error) {
+    FOREACH_ZE_ERROR_CODE(CASE_TO_STRING)
+  default:
+    break;
+  }
+  return "ZE_RESULT_ERROR_UNKNOWN";
+}
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
new file mode 100644
index 0000000000000..8a5f41312e129
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -0,0 +1,86 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Thread-local storage abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Memory.h"
+#include "L0Trace.h"
+#include "PerThreadTable.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// All thread-local data used by the Plugin
+/// Instances are neither copyable nor movable.
+class L0ThreadTLSTy {
+  /// Subdevice encoding
+  int64_t SubDeviceCode = 0;
+
+  /// Async info tracking: a fixed set of queues embedded in this object and
+  /// the number of them that are currently handed out.
+  static constexpr int32_t PerThreadQueues = 10;
+  AsyncQueueTy AsyncQueues[PerThreadQueues];
+  int32_t UsedQueues = 0;
+
+public:
+  L0ThreadTLSTy() = default;
+  L0ThreadTLSTy(const L0ThreadTLSTy &) = delete;
+  L0ThreadTLSTy(L0ThreadTLSTy &&) = delete;
+  L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete;
+  L0ThreadTLSTy &operator=(L0ThreadTLSTy &&) = delete;
+  ~L0ThreadTLSTy() = default;
+
+  /// Intentionally empty.
+  void clear() {}
+
+  int64_t getSubDeviceCode() const { return SubDeviceCode; }
+
+  void setSubDeviceCode(int64_t Code) { SubDeviceCode = Code; }
+
+  /// Hand out one of the embedded queues and mark it as in use.
+  /// Returns nullptr when all PerThreadQueues queues are already handed out.
+  AsyncQueueTy *getAsyncQueue() {
+    if (UsedQueues >= PerThreadQueues)
+      return nullptr;
+
+    // There is a free queue in this object, find it.
+    for (AsyncQueueTy &Queue : AsyncQueues) {
+      if (Queue.InUse)
+        continue;
+      Queue.InUse = true;
+      ++UsedQueues;
+      return &Queue;
+    }
+
+    // Only reachable if UsedQueues and the InUse flags disagree. Without
+    // assertions, report "no queue" instead of dereferencing a null pointer.
+    assert(false && "A queue should have been found!");
+    return nullptr;
+  }
+
+  /// Take back \p queue if it is one of the queues embedded in this object.
+  /// Returns true when the queue belonged to this object and was marked
+  /// free, false when it is not one of AsyncQueues (nothing is changed then).
+  /// NOTE(review): a queue obtained from one L0ThreadTLSTy and released
+  /// through another is not recognized here -- confirm callers always
+  /// release through the object they acquired from.
+  bool releaseAsyncQueue(AsyncQueueTy *queue) {
+    AsyncQueueTy *const First = AsyncQueues;
+    AsyncQueueTy *const PastLast = AsyncQueues + PerThreadQueues;
+    if (queue >= First && queue < PastLast) {
+      // it's a local queue
+      queue->InUse = false;
+      UsedQueues--;
+      return true;
+    }
+    return false;
+  }
+};
+
+/// PerThread table whose entries are L0ThreadTLSTy objects.
+struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
+  /// Forward to PerThread::clear with a callback that calls clear() on the
+  /// entry it is given.
+  void clear() {
+    auto ClearEntry = [](auto &Entry) { Entry.clear(); };
+    PerThread::clear(ClearEntry);
+  }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
new file mode 100644
index 0000000000000..3f50ffd2a7260
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -0,0 +1,41 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Query the API version of \p zeDriver, create a Level Zero context on it,
+/// then initialize the event pool and the host memory pool. The third
+/// parameter (a driver ID) is unused.
+/// NOTE(review): CALL_ZE_RET_VOID returns from the constructor when a call
+/// fails, so all later steps are skipped while the object is still
+/// constructed -- confirm callers can cope with such a partially initialized
+/// context.
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+                         int32_t /*DriverId*/)
+    : Plugin(Plugin), zeDriver(zeDriver) {
+  CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+  // The version is printed in hexadecimal without a "0x" prefix.
+  DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+  ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+  EventPool.init(zeContext, 0);
+  HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+/// Return the staging buffer kept in the plugin's TLS for this context. The
+/// buffer is initialized on first use with the staging buffer size and count
+/// from the plugin options.
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+  auto &Buffer = Plugin.getContextTLS(getZeContext()).getStagingBuffer();
+  if (Buffer.initialized())
+    return Buffer;
+
+  const auto &Opts = Plugin.getOptions();
+  Buffer.init(getZeContext(), Opts.StagingBufferSize, Opts.StagingBufferCount);
+  return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..0029d00a07685
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1065 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Return the L0DeviceTLSTy that the plugin associates with this device's ID.
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+  auto &OwningPlugin = getPlugin();
+  return OwningPlugin.getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+/// Each row holds at most 10 IDs and its list ends with PCIIdTy::None, which
+/// computeArch() treats as the end-of-row marker. DeviceArch_Gen appears in
+/// two rows.
+static struct {
+  DeviceArchTy arch;
+  PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::SKL,
+                       PCIIdTy::KBL,
+                       PCIIdTy::CFL, PCIIdTy::CFL_2,
+                       PCIIdTy::ICX,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::TGL, PCIIdTy::TGL_2,
+                       PCIIdTy::DG1,
+                       PCIIdTy::RKL,
+                       PCIIdTy::ADLS,
+                       PCIIdTy::RTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeLPG,
+                      {PCIIdTy::MTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPC,
+                      {PCIIdTy::PVC,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPG,
+                      {PCIIdTy::DG2_ATS_M,
+                       PCIIdTy::DG2_ATS_M_2,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2LP,
+                      {PCIIdTy::LNL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2HP,
+                      {PCIIdTy::BMG,
+                       PCIIdTy::None}},
+};
+/// Number of rows in DeviceArchMap.
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+/// Derive the device architecture from the PCI device ID.
+/// The ID is masked with 0xFF00 and compared against the entries of
+/// DeviceArchMap; the architecture of the first row containing the masked
+/// value is returned. Returns DeviceArch_None (after a debug message) when
+/// the PCI ID is 0 or no row matches.
+DeviceArchTy L0DeviceTy::computeArch() const {
+  const auto PCIDeviceId = getPCIId();
+  if (PCIDeviceId != 0) {
+    // Only the masked ID is ever compared, so compute it once.
+    const auto MaskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
+    for (int Arch = 0; Arch < DeviceArchMapSize; ++Arch) {
+      const auto &Entry = DeviceArchMap[Arch];
+      // Iterating over the array itself bounds the scan even if a row ever
+      // uses all of its slots and has no room left for the None terminator.
+      for (const PCIIdTy Id : Entry.ids) {
+        if (Id == PCIIdTy::None)
+          break;
+        if (Id == MaskedId)
+          return Entry.arch;
+      }
+    }
+  }
+
+  DP("Warning: Cannot decide device arch for %s.\n", getNameCStr());
+  return DeviceArchTy::DeviceArch_None;
+}
+
+/// Return true when the IP version reported for this device is at least
+/// \p Version. Returns false when the device properties query fails.
+bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const {
+  // Chain the IP-version extension structure behind the regular device
+  // properties so that a single query fills in both.
+  ze_device_ip_version_ext_t IPVersionExt{};
+  IPVersionExt.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
+
+  ze_device_properties_t Props{};
+  Props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+  Props.pNext = &IPVersionExt;
+
+  CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &Props);
+  const bool IsSameOrNewer = IPVersionExt.ipVersion >= Version;
+  return IsSameOrNewer;
+}
+
+/// Get default compute group ordinal. Returns Ordinal-NumQueues pair.
+/// The pair is {UINT32_MAX, 0} when a query fails or when no command queue
+/// group with the compute flag exists.
+std::pair<uint32_t, uint32_t> L0DeviceTy::findComputeOrdinal() {
+  std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+  const auto zeDevice = getZeDevice();
+
+  // The first query only obtains the number of groups, the second one fills
+  // in their properties.
+  uint32_t Count = 0;
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              nullptr);
+  ze_command_queue_group_properties_t Init{
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+  std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              Properties.data());
+
+  // TODO: add a separate set of ordinals for compute queue groups which
+  // support cooperative kernels
+  // Pick the first group that supports compute.
+  uint32_t Group = 0;
+  while (Group < Count &&
+         !(Properties[Group].flags &
+           ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE))
+    ++Group;
+
+  if (Group == Count) {
+    DP("Error: no command queues are found\n");
+    return Ordinal;
+  }
+
+  Ordinal.first = Group;
+  Ordinal.second = Properties[Group].numQueues;
+  return Ordinal;
+}
+
+/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair.
+/// Only groups that support copy but not compute are considered. With
+/// \p LinkCopy set, the first such group with more than one queue is chosen;
+/// otherwise the first such group with exactly one queue. The pair is
+/// {UINT32_MAX, 0} when a query fails or no group qualifies.
+std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
+  std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+  const auto zeDevice = getZeDevice();
+
+  // The first query only obtains the number of groups, the second one fills
+  // in their properties.
+  uint32_t Count = 0;
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              nullptr);
+  ze_command_queue_group_properties_t Init{
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+  std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              Properties.data());
+
+  for (uint32_t I = 0; I < Count; I++) {
+    const auto &Flags = Properties[I].flags;
+    const bool IsCopyOnly =
+        (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+        (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0;
+    if (!IsCopyOnly)
+      continue;
+
+    const auto NumQueues = Properties[I].numQueues;
+    if (LinkCopy && NumQueues > 1) {
+      Ordinal = {I, NumQueues};
+      DP("Found link copy command queue for device " DPxMOD
+         ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n",
+         DPxPTR(zeDevice), Ordinal.first, Ordinal.second);
+      break;
+    }
+    if (!LinkCopy && NumQueues == 1) {
+      Ordinal = {I, NumQueues};
+      DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32
+         "\n",
+         DPxPTR(zeDevice), Ordinal.first);
+      break;
+    }
+  }
+  return Ordinal;
+}
+
+/// Print a summary of this device (name, IDs, EU/slice topology, memory and
+/// cache sizes, clock rate) through the DP debug macro.
+void L0DeviceTy::reportDeviceInfo() const {
+  DP("Device %" PRIu32 "\n", DeviceId);
+  DP("-- Name                         : %s\n", getNameCStr());
+  DP("-- PCI ID                       : 0x%" PRIx32 "\n", getPCIId());
+  DP("-- UUID                         : %s\n", getUuid().c_str());
+  DP("-- Number of total EUs          : %" PRIu32 "\n", getNumEUs());
+  DP("-- Number of threads per EU     : %" PRIu32 "\n", getNumThreadsPerEU());
+  DP("-- EU SIMD width                : %" PRIu32 "\n", getSIMDWidth());
+  DP("-- Number of EUs per subslice   : %" PRIu32 "\n", getNumEUsPerSubslice());
+  DP("-- Number of subslices per slice: %" PRIu32 "\n",
+     getNumSubslicesPerSlice());
+  DP("-- Number of slices             : %" PRIu32 "\n", getNumSlices());
+  DP("-- Local memory size (bytes)    : %" PRIu32 "\n",
+     getMaxSharedLocalMemory());
+  DP("-- Global memory size (bytes)   : %" PRIu64 "\n", getGlobalMemorySize());
+  DP("-- Cache size (bytes)           : %" PRIu64 "\n", getCacheSize());
+  DP("-- Max clock frequency (MHz)    : %" PRIu32 "\n", getClockRate());
+}
+
+/// Query the device, compute, memory and cache properties from Level Zero
+/// and derive the cached per-device state from them: name, architecture,
+/// default allocation kind, indirect access flags, UUID string, command
+/// queue group ordinals and whether asynchronous mode is enabled. Finally
+/// the device memory pools are initialized and the host allocator is told
+/// about this device.
+/// Returns a Plugin::error if one of the property queries fails, otherwise
+/// Plugin::success().
+Error L0DeviceTy::internalInit() {
+  const auto &Options = getPlugin().getOptions();
+
+  uint32_t Count = 1;
+  const auto zeDevice = getZeDevice();
+  CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties);
+  CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties);
+  CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count,
+                    &MemoryProperties);
+  // The driver may have updated Count in the previous call; ask for exactly
+  // one cache properties entry again.
+  Count = 1;
+  CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count,
+                    &CacheProperties);
+
+  // DeviceProperties.name is a fixed-size character array. Keep only the
+  // characters in front of the first NUL (or the whole array if there is
+  // none) so that DeviceName does not carry the padding bytes.
+  const char *NameBegin = DeviceProperties.name;
+  const char *NameEnd = NameBegin + sizeof(DeviceProperties.name);
+  DeviceName.assign(NameBegin, std::find(NameBegin, NameEnd, '\0'));
+
+  DP("Found a GPU device, Name = %s\n", DeviceName.c_str());
+
+  DeviceArch = computeArch();
+  // Default allocation kind for this device
+  AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED;
+
+  ze_kernel_indirect_access_flags_t Flags =
+      (AllocKind == TARGET_ALLOC_DEVICE)
+          ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE
+          : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+  IndirectAccessFlags = Flags;
+
+  // Get the UUID
+  // NOTE(review): the bytes are appended as decimal numbers without a
+  // separator, so different UUIDs can produce the same string -- confirm
+  // whether a fixed-width encoding is wanted.
+  std::string uid = "";
+  for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++)
+    uid += std::to_string(DeviceProperties.uuid.id[n]);
+  DeviceUuid = std::move(uid);
+
+  ComputeOrdinal = findComputeOrdinal();
+
+  CopyOrdinal = findCopyOrdinal();
+
+  LinkCopyOrdinal = findCopyOrdinal(true);
+  // Asynchronous mode is only used on discrete devices and only when the
+  // command mode is not Sync.
+  IsAsyncEnabled =
+      isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
+  MemAllocator.initDevicePools(*this, Options);
+  l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
+  return Plugin::success();
+}
+
+/// Does nothing and always reports success; \p Plugin is unused.
+Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
+  return Plugin::success();
+}
+
+/// Wait for the work recorded in the async queue attached to \p AsyncInfo.
+///
+/// Nothing is done (and OFFLOAD_SUCCESS is returned) when \p AsyncInfo is
+/// null, asynchronous mode is disabled, or no queue is attached. Otherwise
+/// the queue's wait events are synchronized and released, the delayed
+/// USM2M/H2M copies recorded in the queue are performed, and, if
+/// \p ReleaseQueue is set, the queue is released, the staging buffer is
+/// reset and AsyncInfo->Queue is cleared.
+///
+/// Returns OFFLOAD_FAIL as soon as an event synchronization fails.
+/// NOTE(review): on that early return the remaining events and the queue
+/// are not released -- confirm whether cleanup is expected on failure.
+int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+                                bool ReleaseQueue) {
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (!IsAsync)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+
+  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo->Queue);
+  // No queue was ever attached to this AsyncInfo, so there is nothing to
+  // wait for.
+  if (!AsyncQueue)
+    return OFFLOAD_SUCCESS;
+
+  if (!AsyncQueue->WaitEvents.empty()) {
+    const auto &WaitEvents = AsyncQueue->WaitEvents;
+    if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
+      // Only need to wait for the last event
+      CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
+      // Synchronize on kernel event to support printf()
+      auto KE = AsyncQueue->KernelEvent;
+      if (KE && KE != WaitEvents.back()) {
+        CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX);
+      }
+      for (auto &Event : WaitEvents) {
+        releaseEvent(Event);
+      }
+    } else { // Async
+      // Wait for all events. We should wait and reset events in reverse order
+      // to avoid premature event reset. If we have a kernel event in the
+      // queue, it is the last event to wait for since all wait events of the
+      // kernel are signaled before the kernel is invoked. We always invoke
+      // synchronization on kernel event to support printf().
+      bool WaitDone = false;
+      for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
+        if (!WaitDone) {
+          CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX);
+          if (*Itr == AsyncQueue->KernelEvent)
+            WaitDone = true;
+        }
+        releaseEvent(*Itr);
+      }
+    }
+  }
+
+  // Commit delayed USM2M copies
+  for (auto &USM2M : AsyncQueue->USM2MList) {
+    std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+                std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+  }
+  // Commit delayed H2M copies
+  for (auto &H2M : AsyncQueue->H2MList) {
+    std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+                static_cast<char *>(std::get<1>(H2M)));
+  }
+  if (ReleaseQueue) {
+    Plugin.releaseAsyncQueue(AsyncQueue);
+    getStagingBuffer().reset();
+    AsyncInfo->Queue = nullptr;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Copy \p Size bytes from host memory \p HstPtr to target memory \p TgtPtr.
+///
+/// Host/shared USM targets are written directly with std::copy_n. Any other
+/// target goes through a Level Zero copy; on discrete devices a small enough
+/// source that is not host USM is first copied into the staging buffer.
+///
+/// \param AsyncInfo optional async handle. When async mode is enabled and the
+///        handle has no queue yet, one is requested from the plugin; if none
+///        is available the copy is done synchronously.
+/// \returns OFFLOAD_SUCCESS, or the code returned by the failing enqueue call.
+int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+                               __tgt_async_info *AsyncInfo) {
+  if (Size == 0)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+  const auto DeviceId = getDeviceId();
+
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = Plugin.getAsyncQueue();
+    // Without a queue the request is served synchronously.
+    IsAsync = AsyncInfo->Queue != nullptr;
+  }
+
+  const auto DstKind = getMemAllocType(TgtPtr);
+  const bool HostAccessible =
+      DstKind == ZE_MEMORY_TYPE_SHARED || DstKind == ZE_MEMORY_TYPE_HOST;
+  if (!HostAccessible) {
+    const void *SrcPtr = HstPtr;
+    const bool UseStaging =
+        isDiscreteDevice() &&
+        static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
+        getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST;
+    if (UseStaging) {
+      // Stage the payload and let the device copy read from the staging area.
+      void *Staging = getStagingBuffer().get(IsAsync);
+      std::copy_n(static_cast<const char *>(HstPtr), Size,
+                  static_cast<char *>(Staging));
+      SrcPtr = Staging;
+    }
+    const int32_t RC =
+        IsAsync ? enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo)
+                : enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo);
+    if (RC != OFFLOAD_SUCCESS)
+      return RC;
+  } else {
+    std::copy_n(static_cast<const char *>(HstPtr), Size,
+                static_cast<char *>(TgtPtr));
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+       IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr),
+       DPxPTR(TgtPtr));
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Copy \p Size bytes from target memory \p TgtPtr into host memory \p HstPtr.
+///
+/// Host/shared USM sources are read directly with std::copy_n, except that in
+/// async mode with a recorded kernel event the copy is queued on USM2MList.
+/// Other sources go through a Level Zero copy, optionally landing in the
+/// staging buffer first; in async mode the final staging-to-host copy is
+/// queued on H2MList instead of being done here.
+///
+/// \param AsyncInfo optional async handle. When async mode is enabled and the
+///        handle has no queue yet, one is requested from the plugin; if none
+///        is available the copy is done synchronously.
+/// \returns OFFLOAD_SUCCESS, or the code returned by the failing enqueue call.
+int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+                                 __tgt_async_info *AsyncInfo) {
+  if (Size == 0)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+  const auto DeviceId = getDeviceId();
+
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = Plugin.getAsyncQueue();
+    // Without a queue the request is served synchronously.
+    IsAsync = AsyncInfo->Queue != nullptr;
+  }
+  AsyncQueueTy *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+
+  const auto SrcKind = getMemAllocType(TgtPtr);
+  if (SrcKind == ZE_MEMORY_TYPE_HOST || SrcKind == ZE_MEMORY_TYPE_SHARED) {
+    if (IsAsync && AsyncQueue->KernelEvent) {
+      // The Host/Shared USM to host copy must wait for kernel completion, so
+      // it is recorded now and performed later.
+      AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size);
+    } else {
+      std::copy_n(static_cast<const char *>(TgtPtr), Size,
+                  static_cast<char *>(HstPtr));
+    }
+  } else {
+    void *DstPtr = HstPtr;
+    const bool UseStaging =
+        isDiscreteDevice() &&
+        static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
+        getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST;
+    if (UseStaging)
+      DstPtr = getStagingBuffer().get(IsAsync);
+
+    const int32_t RC =
+        IsAsync ? enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
+                                      /* CopyTo */ false)
+                : enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo);
+    if (RC != OFFLOAD_SUCCESS)
+      return RC;
+
+    // The data landed somewhere other than the user's buffer: finish the trip.
+    if (DstPtr != HstPtr) {
+      if (!IsAsync) {
+        std::copy_n(static_cast<char *>(DstPtr), Size,
+                    static_cast<char *>(HstPtr));
+      } else {
+        // Record the delayed H2M data copy.
+        AsyncQueue->H2MList.emplace_back(DstPtr, HstPtr,
+                                         static_cast<size_t>(Size));
+      }
+    }
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+       IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr),
+       DPxPTR(HstPtr));
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Return the program for \p TgtImage, creating it if it does not exist yet.
+///
+/// A new program is registered with addProgram() and then taken through
+/// buildModules(), linkModules() and loadModuleKernels(), in that order.
+///
+/// \param TgtImage device image to load.
+/// \param ImageId identifier passed to addProgram() for the new program.
+/// \returns the program, or an error naming the stage that failed.
+Expected<DeviceImageTy *>
+L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
+                           int32_t ImageId) {
+  auto *PGM = getProgramFromImage(TgtImage);
+  if (PGM) {
+    // Program already exists
+    return PGM;
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
+       DPxPTR(TgtImage->ImageStart));
+
+  const size_t NumEntries =
+      static_cast<size_t>(TgtImage->EntriesEnd - TgtImage->EntriesBegin);
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Expecting to have %zu entries defined\n", NumEntries);
+  (void)NumEntries; // silence warning
+
+  const auto &Options = getPlugin().getOptions();
+  std::string CompilationOptions(Options.CompilationOptions + " " +
+                                 Options.UserCompilationOptions);
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
+
+  // The internal options are appended after the message above, so they are
+  // not part of what gets reported.
+  CompilationOptions += " " + Options.InternalCompilationOptions;
+  // NOTE(review): the program stays registered if one of the stages below
+  // fails -- confirm whether a later call for the same image should find it.
+  auto &Program = addProgram(ImageId, TgtImage);
+
+  int32_t RC = Program.buildModules(CompilationOptions);
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in buildModules %d", RC);
+
+  RC = Program.linkModules();
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in linkModules %d", RC);
+
+  RC = Program.loadModuleKernels();
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in loadModuleKernels %d", RC);
+
+  return &Program;
+}
+
+/// Unload hook for a device image. Currently a no-op: \p Image is ignored and
+/// success is returned.
+Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
+  // Ignoring for now
+  // TODO: call properly L0Program unload
+  return Plugin::success();
+}
+
+/// Error-returning wrapper around synchronize().
+///
+/// Only ReleaseQueue == true is supported; any other request is rejected with
+/// an UNIMPLEMENTED error before any waiting is done.
+Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
+                                  bool ReleaseQueue) {
+  if (ReleaseQueue) {
+    const int32_t Status = synchronize(&AsyncInfo);
+    return Plugin::check(Status, "Error in synchronizeImpl %d", Status);
+  }
+
+  return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                       "Support for ReleaseQueue=false in %s"
+                       " not implemented yet\n",
+                       __func__);
+}
+
+/// Report whether the queue attached to \p AsyncInfoWrapper still has
+/// recorded wait events.
+///
+/// \returns false when there is no queue, async mode is disabled, or the
+///          queue's WaitEvents list is empty; true otherwise.
+Expected<bool>
+L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  __tgt_async_info *Info = static_cast<__tgt_async_info *>(AsyncInfoWrapper);
+  if (!Info->Queue || !asyncEnabled())
+    return false;
+
+  const auto *Queue = static_cast<const AsyncQueueTy *>(Info->Queue);
+  return !Queue->WaitEvents.empty();
+}
+
+/// Non-blocking completion check for \p AsyncInfo.
+///
+/// If the queue has no recorded wait events, the deferred host copies are
+/// committed, the queue is released and AsyncInfo.Queue is cleared. If wait
+/// events are still recorded the function returns without touching anything.
+///
+/// NOTE(review): the recorded events are not polled here, only the list's
+/// emptiness is checked -- confirm that something else drains WaitEvents.
+Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
+  if (!AsyncInfo.Queue || !asyncEnabled())
+    return Plugin::success();
+
+  auto &Plugin = getPlugin();
+  auto *Queue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+  if (!Queue->WaitEvents.empty())
+    return Plugin::success();
+
+  // Commit delayed USM2M copies (entries are source, destination, size).
+  for (auto &Pending : Queue->USM2MList) {
+    const char *From = static_cast<const char *>(std::get<0>(Pending));
+    char *To = static_cast<char *>(std::get<1>(Pending));
+    std::copy_n(From, std::get<2>(Pending), To);
+  }
+  // Commit delayed H2M copies (entries are source, destination, size).
+  for (auto &Pending : Queue->H2MList) {
+    char *From = static_cast<char *>(std::get<0>(Pending));
+    char *To = static_cast<char *>(std::get<1>(Pending));
+    std::copy_n(From, std::get<2>(Pending), To);
+  }
+
+  Plugin.releaseAsyncQueue(Queue);
+  getStagingBuffer().reset();
+  AsyncInfo.Queue = nullptr;
+
+  return Plugin::success();
+}
+
+/// Allocate \p Size bytes of kind \p Kind through dataAlloc() with no
+/// alignment and no offset. The request is flagged as a user allocation
+/// exactly when \p HstPtr is null.
+void *L0DeviceTy::allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) {
+  const bool IsUserAlloc = (HstPtr == nullptr);
+  return dataAlloc(Size, /*Align=*/0, Kind, /*Offset=*/0,
+                   /*UserAlloc=*/IsUserAlloc, /*DevMalloc=*/false);
+}
+
+/// Release \p TgtPtr through dataDelete(). \p Kind is not used.
+int L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+  return dataDelete(TgtPtr);
+}
+
+/// Error-returning wrapper around submitData(): forwards all arguments and
+/// converts the integer result with Plugin::check().
+Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
+  return Plugin::check(RC, "Error in dataSubmitImpl %d", RC);
+}
+
+/// Error-returning wrapper around retrieveData(): forwards all arguments and
+/// converts the integer result with Plugin::check().
+Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
+                                   int64_t Size,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
+  return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC);
+}
+
+/// Copy \p Size bytes from \p SrcPtr on this device to \p DstPtr on
+/// \p DstDev.
+///
+/// With async mode enabled and a queue present the copy is submitted through
+/// enqueueMemCopyAsync(); otherwise a blocking enqueueMemCopy() is used, on
+/// the copy engine only when the two Level Zero device handles differ.
+Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+                                   void *DstPtr, int64_t Size,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  L0DeviceTy &Peer = L0DeviceTy::makeL0Device(DstDev);
+  // Use copy engine only for across-tile/device copies.
+  const bool UseCopyEngine = getZeDevice() != Peer.getZeDevice();
+
+  const bool SubmitAsync = asyncEnabled() && AsyncInfoWrapper.hasQueue();
+  const int32_t RC =
+      SubmitAsync
+          ? enqueueMemCopyAsync(
+                DstPtr, SrcPtr, Size,
+                static_cast<__tgt_async_info *>(AsyncInfoWrapper))
+          : enqueueMemCopy(DstPtr, SrcPtr, Size,
+                           /* AsyncInfo */ nullptr,
+                           /* Locked */ false, UseCopyEngine);
+  if (RC)
+    return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+
+  return Plugin::success();
+}
+
+/// Make sure \p AsyncInfoWrapper carries a queue: when it has none, the
+/// result of the plugin's getAsyncQueue() is stored in it. Always succeeds.
+Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  if (AsyncInfoWrapper.getQueueAs<AsyncQueueTy *>())
+    return Plugin::success();
+
+  AsyncQueueTy *Fresh = getPlugin().getAsyncQueue();
+  AsyncInfoWrapper.setQueueAs<AsyncQueueTy *>(Fresh);
+  return Plugin::success();
+}
+
+/// Fill in the Context and Device fields of \p Info with this device's Level
+/// Zero handles. Fields that are already non-null are left untouched.
+Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
+  if (!Info->Context)
+    Info->Context = getZeContext();
+  if (!Info->Device)
+    Info->Device = reinterpret_cast<void *>(getZeDevice());
+  return Plugin::success();
+}
+
+/// Build the device information tree: one entry per property, each taken
+/// from the corresponding getter of this device.
+Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
+  InfoTreeNode Info;
+  // Identification.
+  Info.add("Device Number", getDeviceId());
+  Info.add("Device Name", getNameCStr());
+  Info.add("Device PCI ID", getPCIId());
+  Info.add("Device UUID", getUuid().c_str());
+  // Execution resources.
+  Info.add("Number of total EUs", getNumEUs());
+  Info.add("Number of threads per EU", getNumThreadsPerEU());
+  Info.add("EU SIMD width", getSIMDWidth());
+  Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
+  Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
+  Info.add("Number of slices", getNumSlices());
+  // Memory and clock.
+  Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
+  Info.add("Global memory size (bytes)", getGlobalMemorySize());
+  Info.add("Cache size (bytes)", getCacheSize());
+  Info.add("Max clock frequency (MHz)", getClockRate());
+  return Info;
+}
+
+/// Create an L0KernelTy named \p Name in storage obtained from the plugin's
+/// allocate<L0KernelTy>().
+///
+/// \returns a reference to the constructed kernel, or an UNKNOWN error when
+///          the plugin returns no storage.
+Expected<GenericKernelTy &> L0DeviceTy::constructKernel(const char *Name) {
+  L0KernelTy *Slot = getPlugin().allocate<L0KernelTy>();
+  if (Slot) {
+    // Construct the kernel in place in the storage handed out by the plugin.
+    return *new (Slot) L0KernelTy(Name);
+  }
+
+  return Plugin::error(ErrorCode::UNKNOWN,
+                       "Failed to allocate memory for L0 kernel");
+}
+
+/// Query the Level Zero allocation type of \p Ptr in this device's context.
+///
+/// \returns the type reported by zeMemGetAllocProperties, or
+///          ZE_MEMORY_TYPE_UNKNOWN when the call reports an invalid argument.
+///          The properties start out as ZE_MEMORY_TYPE_UNKNOWN, so that is
+///          also the result if the call leaves them untouched.
+uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const {
+  ze_memory_allocation_properties_t Props = {
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
+      nullptr,                // extension
+      ZE_MEMORY_TYPE_UNKNOWN, // type
+      0,                      // id
+      0,                      // page size
+  };
+
+  ze_result_t Status;
+  CALL_ZE(Status, zeMemGetAllocProperties, getZeContext(), Ptr, &Props,
+          nullptr);
+
+  if (Status != ZE_RESULT_ERROR_INVALID_ARGUMENT)
+    return Props.type;
+  return ZE_MEMORY_TYPE_UNKNOWN;
+}
+
+/// Choose the interop specification for this device.
+///
+/// No preference matching is done: \p InteropType, \p NumPrefers and
+/// \p Prefers are not read.
+interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType,
+                                                   int32_t NumPrefers,
+                                                   interop_spec_t *Prefers) {
+  // Always answer with the level_zero foreign runtime; the inorder attribute
+  // is whatever forceInorderInterop() reports.
+  return interop_spec_t{
+      tgt_fr_level_zero, {forceInorderInterop() /*inorder*/, 0}, 0};
+}
+
+/// Build an omp_interop_val_t describing this device for the Level Zero
+/// foreign runtime.
+///
+/// \param InteropContext interop type. target and targetsync fill in the
+///        driver, device and context handles; targetsync additionally gets an
+///        async_info whose Queue is a newly created immediate command list or
+///        command queue.
+/// \param InteropSpec requested attributes; only attrs.inorder is read.
+/// \returns the newly allocated interop object.
+Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
+                                                 interop_spec_t &InteropSpec) {
+  auto Ret =
+      new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext);
+  Ret->fr_id = tgt_fr_level_zero;
+  Ret->vendor_id = omp_vendor_intel;
+
+  if (InteropContext == kmp_interop_type_target ||
+      InteropContext == kmp_interop_type_targetsync) {
+    Ret->device_info.Platform = getZeDriver();
+    Ret->device_info.Device = getZeDevice();
+    Ret->device_info.Context = getZeContext();
+  }
+
+  // The plugin-specific property block is allocated for every interop type;
+  // releaseInterop() deletes it.
+  Ret->rtl_property = new L0Interop::Property();
+  if (InteropContext == kmp_interop_type_targetsync) {
+    Ret->async_info = new __tgt_async_info();
+    auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
+
+    bool InOrder = InteropSpec.attrs.inorder;
+    Ret->attrs.inorder = InOrder;
+    // The same handle is stored twice: in async_info->Queue and in the
+    // property block.
+    // NOTE(review): the handle is stored without a null check -- confirm
+    // whether a failed creation should be reported to the caller.
+    if (useImmForInterop()) {
+      auto CmdList = createImmCmdList(InOrder);
+      Ret->async_info->Queue = CmdList;
+      L0->ImmCmdList = CmdList;
+    } else {
+      Ret->async_info->Queue = createCommandQueue(InOrder);
+      L0->CommandQueue =
+          static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
+    }
+  }
+
+  return Ret;
+}
+
+/// Destroy an interop object created for this device.
+///
+/// \param Interop object to release; it must be non-null and carry this
+///        device's id, otherwise INVALID_ARGUMENT is returned.
+/// \returns success after the Level Zero handle (if any), the property block
+///          and the interop object itself have been destroyed.
+Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
+  const auto DeviceId = getDeviceId();
+
+  if (!Interop || Interop->device_id != (intptr_t)DeviceId) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  // Only interops that were given a queue own a Level Zero handle; which kind
+  // of handle it is follows useImmForInterop().
+  // NOTE(review): if CALL_ZE_RET_ERROR returns early on failure (presumably
+  // it does), the two deletes below are skipped -- confirm this is intended.
+  if (Interop->async_info && Interop->async_info->Queue) {
+    if (useImmForInterop()) {
+      auto ImmCmdList = L0->ImmCmdList;
+      CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList);
+    } else {
+      auto CmdQueue = L0->CommandQueue;
+      CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue);
+    }
+  }
+  // NOTE(review): Interop->async_info is not deleted here -- confirm who owns
+  // it.
+  delete L0;
+  delete Interop;
+
+  return Plugin::success();
+}
+
+/// Copy \p Size bytes from \p Src to \p Dst and wait for completion.
+///
+/// With immediate command lists enabled for copies, the copy is appended to
+/// an immediate command list and waited on through an event. Otherwise it is
+/// appended to a regular command list that is closed, executed, synchronized
+/// and reset.
+///
+/// \param AsyncInfo not used by this function.
+/// \param Locked when true, the command list is executed without going
+///        through getMutex(). NOTE(review): presumably the caller already
+///        holds that mutex -- confirm.
+/// \param UseCopyEngine selects the copy command list/queue instead of the
+///        compute one.
+/// \returns OFFLOAD_SUCCESS; a failing Level Zero call leaves early through
+///          the CALL_ZE_RET_FAIL macros.
+int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                                   __tgt_async_info *AsyncInfo, bool Locked,
+                                   bool UseCopyEngine) {
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+  ze_event_handle_t Event = nullptr;
+
+  if (useImmForCopy()) {
+    CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+    Event = getEvent();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                     Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+    // The copy has completed, so the event can go back where getEvent() took
+    // it from, as synchronize() does for the events it waits on.
+    releaseEvent(Event);
+  } else {
+    if (UseCopyEngine) {
+      CmdList = getCopyCmdList();
+      CmdQueue = getCopyCmdQueue();
+    } else {
+      CmdList = getCmdList();
+      CmdQueue = getCmdQueue();
+    }
+
+    // No signal event is used on this path (Event is still null); completion
+    // is observed through the queue synchronization below.
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                     Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    if (Locked) {
+      CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                       nullptr);
+    } else {
+      CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+                           CmdQueue, 1, &CmdList, nullptr);
+    }
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue non-blocking memory copy. This function is invoked only when IMM is
+/// fully enabled and async mode is requested.
+///
+/// The copy is appended to the immediate copy command list with a new signal
+/// event, which is then recorded at the back of the queue's WaitEvents. It
+/// waits on at most one event: the most recent recorded event in AsyncOrdered
+/// mode, otherwise the queue's kernel event if there is one.
+///
+/// \param AsyncInfo async handle whose Queue is an AsyncQueueTy.
+/// \param CopyTo not used by this function.
+int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                                        __tgt_async_info *AsyncInfo,
+                                        bool CopyTo) {
+  const bool Ordered =
+      (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+  ze_event_handle_t SignalEvent = getEvent();
+  auto *Queue = static_cast<AsyncQueueTy *>(AsyncInfo->Queue);
+
+  // Pick the single dependency, if any. A dependency is only considered when
+  // the queue already has recorded events.
+  ze_event_handle_t *Dependency = nullptr;
+  if (!Queue->WaitEvents.empty()) {
+    if (Ordered)
+      Dependency = &Queue->WaitEvents.back();
+    else if (Queue->KernelEvent)
+      Dependency = &Queue->KernelEvent;
+  }
+  const size_t NumDependencies = Dependency ? 1 : 0;
+
+  auto CmdList = getImmCopyCmdList();
+  CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                   SignalEvent, NumDependencies, Dependency);
+  Queue->WaitEvents.push_back(SignalEvent);
+  return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue memory fill and wait for it to complete.
+///
+/// Fills \p Size bytes at \p Ptr with the \p PatternSize byte pattern at
+/// \p Pattern, using the immediate copy command list when immediate command
+/// lists are enabled for copies and the regular copy command list/queue
+/// otherwise.
+///
+/// \returns OFFLOAD_SUCCESS; a failing Level Zero call leaves early through
+///          CALL_ZE_RET_FAIL.
+int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
+                                   size_t PatternSize, size_t Size) {
+  if (useImmForCopy()) {
+    const auto CmdList = getImmCopyCmdList();
+    auto Event = getEvent();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                     PatternSize, Size, Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+    // The fill has completed, so the event can go back where getEvent() took
+    // it from, as synchronize() does for the events it waits on.
+    releaseEvent(Event);
+  } else {
+    auto CmdList = getCopyCmdList();
+    const auto CmdQueue = getCopyCmdQueue();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                     PatternSize, Size, nullptr, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                     nullptr);
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Error-returning wrapper around enqueueMemFill(). The fill is always
+/// performed through that blocking helper; \p AsyncInfoWrapper is not used.
+Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                               int64_t PatternSize, int64_t Size,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  // TODO: support async version
+  // TODO: convert enqueueMemFill to return Error code
+  const int32_t RC = enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size);
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
+
+  return Plugin::success();
+}
+
+/// Allocate memory through the allocator that serves the requested kind.
+///
+/// When \p Kind is TARGET_ALLOC_DEFAULT it is resolved first: user
+/// allocations and the reduction scratch/counter options map to device
+/// memory, the host-memory option maps to host memory, and everything else
+/// uses getAllocKind(). All arguments are then forwarded to the allocator
+/// returned by getMemAllocator() for the resolved kind.
+void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
+                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
+                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+
+  if (Kind == TARGET_ALLOC_DEFAULT) {
+    if (UserAlloc) {
+      Kind = TARGET_ALLOC_DEVICE;
+    } else {
+      switch (AllocOpt) {
+      case AllocOptionTy::ALLOC_OPT_HOST_MEM:
+        Kind = TARGET_ALLOC_HOST;
+        break;
+      case AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH:
+      case AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER:
+        // These options use a dedicated pool in device memory.
+        Kind = TARGET_ALLOC_DEVICE;
+        break;
+      default:
+        Kind = getAllocKind();
+        break;
+      }
+    }
+  }
+
+  auto &Allocator = getMemAllocator(Kind);
+  return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
+                         MemAdvice, AllocOpt);
+}
+
+/// Free \p Ptr through the allocator that getMemAllocator() returns for that
+/// pointer, and return that allocator's dealloc() result.
+int32_t L0DeviceTy::dataDelete(void *Ptr) {
+  auto &Allocator = getMemAllocator(Ptr);
+  return Allocator.dealloc(Ptr);
+}
+
+/// Ask Level Zero to make \p Size bytes at \p Mem resident on this device.
+///
+/// \returns OFFLOAD_SUCCESS when zeContextMakeMemoryResident succeeds,
+///          OFFLOAD_FAIL (after a debug message) otherwise.
+int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
+  ze_result_t Status;
+  CALL_ZE(Status, zeContextMakeMemoryResident, getZeContext(), getZeDevice(),
+          Mem, Size);
+  if (Status == ZE_RESULT_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD
+     ".\n",
+     DPxPTR(Mem), DPxPTR(getZeDevice()));
+  return OFFLOAD_FAIL;
+}
+
+// Command queues related functions
+/// Create a command list with given ordinal and flags
+///
+/// \param DeviceIdStr device identifier used only in the debug message.
+/// \returns the new command list handle; creation failures are handled by
+///          CALL_ZE_RET_NULL.
+ze_command_list_handle_t L0DeviceTy::createCmdList(
+    ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+    ze_command_list_flags_t Flags, const std::string &DeviceIdStr) {
+  ze_command_list_desc_t Desc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+                                 nullptr, // extension
+                                 Ordinal, Flags};
+  ze_command_list_handle_t Handle = nullptr;
+  CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &Desc, &Handle);
+  DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
+     ") for device %s.\n",
+     DPxPTR(Handle), Ordinal, DeviceIdStr.c_str());
+  return Handle;
+}
+
+/// Create a command list with default flags
+///
+/// \returns nullptr without creating anything when \p Ordinal is UINT32_MAX.
+ze_command_list_handle_t
+L0DeviceTy::createCmdList(ze_context_handle_t Context,
+                          ze_device_handle_t Device, uint32_t Ordinal,
+                          const std::string &DeviceIdStr) {
+  if (Ordinal == UINT32_MAX)
+    return nullptr;
+  return createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
+}
+
+/// Return the compute command list held by the object from getTLS(), creating
+/// it on the compute engine ordinal and storing it there when none is set.
+ze_command_list_handle_t L0DeviceTy::getCmdList() {
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getCmdList())
+    return Cached;
+
+  auto Created = createCmdList(getZeContext(), getZeDevice(),
+                               getComputeEngine(), getZeId());
+  TLS.setCmdList(Created);
+  return Created;
+}
+
+/// Create a command queue with given ordinal and flags
+///
+/// The queue is created in asynchronous mode with normal priority.
+///
+/// \param DeviceIdStr device identifier used only in the debug message.
+/// \returns the new command queue handle; creation failures are handled by
+///          CALL_ZE_RET_NULL.
+ze_command_queue_handle_t
+L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
+                           ze_device_handle_t Device, uint32_t Ordinal,
+                           uint32_t Index, ze_command_queue_flags_t Flags,
+                           const std::string &DeviceIdStr) {
+  ze_command_queue_desc_t Desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+                                  nullptr, // extension
+                                  Ordinal,
+                                  Index,
+                                  Flags, // flags
+                                  ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+                                  ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_queue_handle_t Handle = nullptr;
+  CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &Desc, &Handle);
+  DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
+     ", Flags: %" PRIu32 ") for device %s.\n",
+     DPxPTR(Handle), Ordinal, Index, Flags, DeviceIdStr.c_str());
+  return Handle;
+}
+
+/// Create a command queue with default flags
+///
+/// \param InOrder when true the queue is created with
+///        ZE_COMMAND_QUEUE_FLAG_IN_ORDER, otherwise with no flags.
+/// \returns nullptr without creating anything when \p Ordinal is UINT32_MAX.
+ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
+    ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+    uint32_t Index, const std::string &DeviceIdStr, bool InOrder) {
+  if (Ordinal == UINT32_MAX)
+    return nullptr;
+
+  const ze_command_queue_flags_t Flags =
+      InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+  return createCmdQueue(Context, Device, Ordinal, Index, Flags, DeviceIdStr);
+}
+
+/// Create a new command queue for this device on its compute engine ordinal
+/// and compute index.
+ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
+  return createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(),
+                        getComputeIndex(), getZeId(), InOrder);
+}
+
+/// Create an immediate command list
+///
+/// The list is created in this device's context, in asynchronous mode with
+/// normal priority.
+///
+/// \param InOrder when true the list is created with
+///        ZE_COMMAND_QUEUE_FLAG_IN_ORDER, otherwise with no flags.
+/// \returns the new handle; creation failures are handled by
+///          CALL_ZE_RET_NULL.
+ze_command_list_handle_t
+L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
+  ze_command_queue_flags_t Flags = 0;
+  if (InOrder)
+    Flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
+
+  ze_command_queue_desc_t QueueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+                                    nullptr,
+                                    Ordinal,
+                                    Index,
+                                    Flags,
+                                    ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+                                    ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_list_handle_t Handle = nullptr;
+  CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
+                   &QueueDesc, &Handle);
+  DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32
+     ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n",
+     DPxPTR(Handle), Ordinal, Index, Flags, getZeIdCStr());
+  return Handle;
+}
+
+/// Create an immediate command list for copying
+///
+/// The ordinal is the main copy engine; when that is UINT32_MAX the link copy
+/// engine is tried, and when that is UINT32_MAX too the compute engine is
+/// used. Index 0 is always requested.
+ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
+  uint32_t Ordinal = getMainCopyEngine();
+  if (Ordinal == UINT32_MAX) {
+    Ordinal = getLinkCopyEngine();
+    if (Ordinal == UINT32_MAX)
+      Ordinal = getComputeEngine();
+  }
+  return createImmCmdList(Ordinal, /*Index*/ 0);
+}
+
+/// Return the compute command queue held by the object from getTLS(),
+/// creating it with createCommandQueue() and storing it there when none is
+/// set.
+ze_command_queue_handle_t L0DeviceTy::getCmdQueue() {
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getCmdQueue())
+    return Cached;
+
+  auto Created = createCommandQueue();
+  TLS.setCmdQueue(Created);
+  return Created;
+}
+
+/// Return the command list to use for copies.
+///
+/// Preference order: the main copy engine list (created on first use and kept
+/// in the object from getTLS()), then the link copy engine list, then the
+/// compute command list.
+ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
+  if (!hasMainCopyEngine()) {
+    // No main copy engine: fall back to the link copy engine if there is one,
+    // and to the compute engine otherwise.
+    return hasLinkCopyEngine() ? getLinkCopyCmdList() : getCmdList();
+  }
+
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getCopyCmdList())
+    return Cached;
+
+  auto Created = createCmdList(getZeContext(), getZeDevice(),
+                               getMainCopyEngine(), getZeId());
+  TLS.setCopyCmdList(Created);
+  return Created;
+}
+
+/// Return the command queue to use for copies.
+///
+/// Preference order: the main copy engine queue (created at index 0 on first
+/// use and kept in the object from getTLS()), then the link copy engine
+/// queue, then the compute command queue.
+ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
+  if (!hasMainCopyEngine()) {
+    // No main copy engine: fall back to the link copy engine if there is one,
+    // and to the compute engine otherwise.
+    return hasLinkCopyEngine() ? getLinkCopyCmdQueue() : getCmdQueue();
+  }
+
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getCopyCmdQueue())
+    return Cached;
+
+  auto Created = createCmdQueue(getZeContext(), getZeDevice(),
+                                getMainCopyEngine(), 0, getZeId());
+  TLS.setCopyCmdQueue(Created);
+  return Created;
+}
+
+/// Return the command list to use for link copies.
+///
+/// Preference order: the link copy engine list (created with
+/// ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY on first use and kept in the object
+/// from getTLS()), then the main copy engine list, then the compute command
+/// list.
+ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
+  if (!hasLinkCopyEngine()) {
+    // No link copy engine: fall back to the main copy engine if there is one,
+    // and to the compute engine otherwise.
+    return hasMainCopyEngine() ? getCopyCmdList() : getCmdList();
+  }
+
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getLinkCopyCmdList())
+    return Cached;
+
+  auto Created =
+      createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+                    ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
+  TLS.setLinkCopyCmdList(Created);
+  return Created;
+}
+
+/// Return the command queue to use for link copies.
+///
+/// Preference order: the link copy engine queue (created with
+/// ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY on first use and kept in the object
+/// from getTLS()), then the main copy engine queue, then the compute command
+/// queue.
+ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
+  if (!hasLinkCopyEngine()) {
+    // No link copy engine: fall back to the main copy engine if there is one,
+    // and to the compute engine otherwise.
+    return hasMainCopyEngine() ? getCopyCmdQueue() : getCmdQueue();
+  }
+
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getLinkCopyCmdQueue())
+    return Cached;
+
+  // Try to use different copy engines for multiple threads: the queue index
+  // is the global thread number modulo the number of link copy queues.
+  const uint32_t Index =
+      __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
+  auto Created =
+      createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(), Index,
+                     ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
+  TLS.setLinkCopyCmdQueue(Created);
+  return Created;
+}
+
+/// Return the immediate command list held by the object from getTLS(),
+/// creating it with createImmCmdList() and storing it there when none is set.
+ze_command_list_handle_t L0DeviceTy::getImmCmdList() {
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getImmCmdList())
+    return Cached;
+
+  auto Created = createImmCmdList();
+  TLS.setImmCmdList(Created);
+  return Created;
+}
+
+/// Return the immediate copy command list held by the object from getTLS(),
+/// creating it with createImmCopyCmdList() and storing it there when none is
+/// set.
+ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() {
+  auto &TLS = getTLS();
+  if (auto Cached = TLS.getImmCopyCmdList())
+    return Cached;
+
+  auto Created = createImmCopyCmdList();
+  TLS.setImmCopyCmdList(Created);
+  return Created;
+}
+
+/// Insert a barrier into the copy command stream.
+///
+/// In AsyncOrdered mode this is a no-op. Otherwise a barrier is appended to
+/// the immediate copy command list, or, when immediate command lists are not
+/// used for copies, to the regular copy command list, which is then closed,
+/// executed, waited on and reset.
+///
+/// \param Async not used by this function.
+/// \returns success; a failing Level Zero call is reported through
+///          CALL_ZE_RET_ERROR.
+Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
+  const bool Ordered =
+      (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+
+  // Nothing to do if everything is ordered
+  if (Ordered)
+    return Plugin::success();
+
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+
+  if (useImmForCopy()) {
+    CmdList = getImmCopyCmdList();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+  } else {
+    CmdList = getCopyCmdList();
+    CmdQueue = getCopyCmdQueue();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                      nullptr);
+    // The queues in this file are created in asynchronous mode, so the list
+    // may still be executing here. Wait before resetting it, as
+    // enqueueMemCopy() and enqueueMemFill() do.
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+  }
+
+  return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
new file mode 100644
index 0000000000000..06f01f23285fc
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
@@ -0,0 +1,134 @@
+//===--- level_zero/src/L0DynWrapper.cpp -------------------------- C++ -*-===//
+//
+// Implement wrapper for level_zero API calls through dlopen
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+#include <memory>
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+// Table of the Level Zero entry points that loadLevelZero() resolves at run
+// time. Each DLWRAP(name, N) line registers `name`; N is presumably the
+// number of parameters of that API function (see DLWrap.h) - TODO confirm.
+// zeInit uses DLWRAP_INTERNAL because this file defines its own zeInit, which
+// loads the library first and then forwards to dlwrap_zeInit.
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(zeInit, 1)
+DLWRAP(zeDriverGet, 2)
+DLWRAP(zeDeviceGet, 3)
+DLWRAP(zeDeviceGetSubDevices, 3)
+DLWRAP(zeModuleCreate, 5)
+DLWRAP(zeModuleGetProperties, 2)
+DLWRAP(zeModuleBuildLogDestroy, 1)
+DLWRAP(zeModuleBuildLogGetString, 3)
+DLWRAP(zeModuleGetKernelNames, 3)
+DLWRAP(zeModuleDestroy, 1)
+DLWRAP(zeCommandListAppendBarrier, 4)
+DLWRAP(zeCommandListAppendLaunchKernel, 6)
+DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6)
+DLWRAP(zeCommandListAppendMemoryCopy, 7)
+DLWRAP(zeCommandListAppendMemoryCopyRegion, 12)
+DLWRAP(zeCommandListAppendMemoryFill, 8)
+DLWRAP(zeCommandListAppendMemoryPrefetch, 3)
+DLWRAP(zeCommandListAppendMemAdvise, 5)
+DLWRAP(zeCommandListClose, 1)
+DLWRAP(zeCommandListCreate, 4)
+DLWRAP(zeCommandListCreateImmediate, 4)
+DLWRAP(zeCommandListDestroy, 1)
+DLWRAP(zeCommandListReset, 1)
+DLWRAP(zeCommandQueueCreate, 4)
+DLWRAP(zeCommandQueueDestroy, 1)
+DLWRAP(zeCommandQueueExecuteCommandLists, 4)
+DLWRAP(zeCommandQueueSynchronize, 2)
+DLWRAP(zeContextCreate, 3)
+DLWRAP(zeContextDestroy, 1)
+DLWRAP(zeContextMakeMemoryResident, 4)
+DLWRAP(zeDeviceCanAccessPeer, 3)
+DLWRAP(zeDeviceGetProperties, 2)
+DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3)
+DLWRAP(zeDeviceGetComputeProperties, 2)
+DLWRAP(zeDeviceGetMemoryProperties, 3)
+DLWRAP(zeDeviceGetCacheProperties, 3)
+DLWRAP(zeDeviceGetGlobalTimestamps, 3)
+DLWRAP(zeDriverGetApiVersion, 2)
+DLWRAP(zeDriverGetExtensionFunctionAddress, 3)
+DLWRAP(zeDriverGetExtensionProperties, 3)
+DLWRAP(zeEventCreate, 3)
+DLWRAP(zeEventDestroy, 1)
+DLWRAP(zeEventHostReset, 1)
+DLWRAP(zeEventHostSynchronize, 2)
+DLWRAP(zeEventPoolCreate, 5)
+DLWRAP(zeEventPoolDestroy, 1)
+DLWRAP(zeEventQueryKernelTimestamp, 2)
+DLWRAP(zeFenceCreate, 3)
+DLWRAP(zeFenceDestroy, 1)
+DLWRAP(zeFenceHostSynchronize, 2)
+DLWRAP(zeKernelCreate, 3)
+DLWRAP(zeKernelDestroy, 1)
+DLWRAP(zeKernelGetName, 3)
+DLWRAP(zeKernelGetProperties, 2)
+DLWRAP(zeKernelSetArgumentValue, 4)
+DLWRAP(zeKernelSetGroupSize, 4)
+DLWRAP(zeKernelSetIndirectAccess, 2)
+DLWRAP(zeKernelSuggestGroupSize, 7)
+DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2)
+DLWRAP(zeMemAllocDevice, 6)
+DLWRAP(zeMemAllocHost, 5)
+DLWRAP(zeMemAllocShared, 7)
+DLWRAP(zeMemFree, 2)
+DLWRAP(zeMemGetAddressRange, 4)
+DLWRAP(zeMemGetAllocProperties, 4)
+DLWRAP(zeModuleDynamicLink, 3)
+DLWRAP(zeModuleGetGlobalPointer, 4)
+DLWRAP(zesDeviceEnumMemoryModules, 3)
+DLWRAP(zesMemoryGetState, 2)
+
+DLWRAP_FINALIZE()
+
+#ifndef LEVEL_ZERO_LIBRARY
+#error "Level zero library not defined"
+#endif
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+/// Load the library named by LEVEL_ZERO_LIBRARY and store, for every symbol
+/// registered in the dlwrap table, the address found in that library.
+///
+/// \returns true when the library loaded and every registered symbol was
+/// found; false otherwise.
+static bool loadLevelZero() {
+  const char *L0Library = LEVEL_ZERO_LIBRARY;
+  std::string ErrMsg;
+
+  DP("Trying to load %s\n", L0Library);
+  llvm::sys::DynamicLibrary Lib =
+      llvm::sys::DynamicLibrary::getPermanentLibrary(L0Library, &ErrMsg);
+  if (!Lib.isValid()) {
+    DP("Unable to load library '%s': %s!\n", L0Library,
+       ErrMsg.empty() ? "unknown error" : ErrMsg.c_str());
+    return false;
+  }
+
+  const size_t NumSymbols = dlwrap::size();
+  for (size_t Idx = 0; Idx != NumSymbols; ++Idx) {
+    const char *Sym = dlwrap::symbol(Idx);
+    void *Addr = Lib.getAddressOfSymbol(Sym);
+    if (!Addr) {
+      DP("Unable to find '%s' in '%s'!\n", Sym, L0Library);
+      return false;
+    }
+    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, Addr);
+
+    // Publish the resolved address in the slot the wrapper calls through.
+    *dlwrap::pointer(Idx) = Addr;
+  }
+
+  return true;
+}
+
+/// zeInit entry point seen by the rest of the plugin: load the library and
+/// bind its symbols first, then forward to the wrapped zeInit.
+/// \returns ZE_RESULT_ERROR_UNKNOWN when loading fails, otherwise the result
+/// of dlwrap_zeInit.
+ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) {
+  const bool Loaded = loadLevelZero();
+  return Loaded ? dlwrap_zeInit(flags) : ZE_RESULT_ERROR_UNKNOWN;
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
new file mode 100644
index 0000000000000..d1cb0b7bd50bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Launch this kernel on \p GenericDevice by delegating to
+/// runTargetTeamRegion() and translating its status code into an Error.
+///
+/// \p NumThreads and \p NumBlocks are not read here.
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
+                             KernelArgsTy &KernelArgs,
+                             KernelLaunchParamsTy LaunchParams,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  auto &Device = L0DeviceTy::makeL0Device(GenericDevice);
+  const int32_t Status = runTargetTeamRegion(
+      Device, KernelArgs, std::move(LaunchParams), AsyncInfoWrapper);
+  if (Status != OFFLOAD_SUCCESS)
+    return Plugin::error(error::ErrorCode::UNKNOWN,
+                         "Error in launch Kernel %s: %d", getName(), Status);
+  return Plugin::success();
+}
+
+/// Create the Level Zero kernel object for this kernel's name from the module
+/// that \p Program reports for that name, storing the handle in zeKernel.
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+  const auto *KernelName = getName();
+
+  ze_kernel_desc_t KernelDesc{};
+  KernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
+  KernelDesc.pNext = nullptr;
+  KernelDesc.flags = 0;
+  KernelDesc.pKernelName = KernelName;
+
+  auto Module = Program.findModuleFromKernelName(KernelName);
+  CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
+  return Plugin::success();
+}
+
+/// Build the kernel from the program behind \p Image and register it with
+/// that program. \p GenericDevice is not read here.
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+                           DeviceImageTy &Image) {
+  auto &Program = L0ProgramTy::makeL0Program(Image);
+
+  if (Error Err = buildKernel(Program))
+    return Err;
+
+  Program.addKernel(this);
+  return Plugin::success();
+}
+
+/// Query the host runtime for the current teams thread limit and maximum
+/// number of teams. The values are read on every call because they may change
+/// at any program point.
+///
+/// \returns {NumTeams, ThreadLimit}; each entry is 0 when the runtime reports
+/// a non-positive value or INT32_MAX for it.
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+  // NOTE: Windows.h defines a max() macro, so the call has to be guarded with
+  //       parentheses.
+  const auto Normalize = [](int Value) -> int32_t {
+    const bool IsSet =
+        Value > 0 && Value != (std::numeric_limits<int32_t>::max)();
+    return IsSet ? Value : 0;
+  };
+
+  const int ThrLimit = omp_get_teams_thread_limit();
+  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+
+  const int NTeams = omp_get_max_teams();
+  DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+
+  return {Normalize(NTeams), Normalize(ThrLimit)};
+}
+
+/// Decide the team (group) size and the number of teams (group counts) for a
+/// kernel launch that does not use a specific ND-range.
+///
+/// \param Device device whose limits (max group size, subslices, threads per
+///        subslice) drive the heuristics.
+/// \param NumTeams if non-zero, forces the group count in dimension 0.
+/// \param ThreadLimit if non-zero, forces the group size in dimension 0.
+/// \param LoopLevels optional loop description; when it describes a single
+///        loop with a positive stride, its trip count may reduce the group
+///        count.
+/// \param GroupSizes [out] receives the three group sizes.
+/// \param GroupCounts [out] receives the three group counts.
+/// \param HalfNumThreads when true, only half of the threads per subslice are
+///        assumed to be available.
+/// \param IsTeamsNDRange selects how the trip count is turned into a group
+///        count.
+void L0KernelTy::decideKernelGroupArguments(
+    L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+    TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+    ze_group_count_t &GroupCounts, bool HalfNumThreads,
+    bool IsTeamsNDRange) const {
+
+  const KernelPropertiesTy &KernelPR = getProperties();
+
+  const auto DeviceId = Device.getDeviceId();
+  bool MaxGroupSizeForced = false;
+  bool MaxGroupCountForced = false;
+  uint32_t MaxGroupSize = Device.getMaxGroupSize();
+  const auto &Option = LevelZeroPluginTy::getOptions();
+  const auto OptSubscRate = Option.SubscriptionRate;
+
+  uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  uint32_t KernelWidth = KernelPR.Width;
+  uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+  if (KernelMaxThreadGroupSize < MaxGroupSize) {
+    MaxGroupSize = KernelMaxThreadGroupSize;
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Capping maximum team size to %" PRIu32
+         " due to kernel constraints.\n",
+         MaxGroupSize);
+  }
+
+  if (ThreadLimit > 0) {
+    MaxGroupSizeForced = true;
+    MaxGroupSize = ThreadLimit;
+  }
+
+  uint32_t MaxGroupCount = 0;
+  if (NumTeams > 0) {
+    MaxGroupCount = NumTeams;
+    MaxGroupCountForced = true;
+  }
+
+  if (MaxGroupCountForced) {
+    // If number of teams is specified by the user, then use KernelWidth
+    // WIs per WG by default, so that it matches
+    // decideLoopKernelGroupArguments() behavior.
+    if (!MaxGroupSizeForced)
+      MaxGroupSize = KernelWidth;
+  } else {
+    const uint32_t NumSubslices = Device.getNumSubslices();
+    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+    if (HalfNumThreads)
+      NumThreadsPerSubslice /= 2;
+
+    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+    if (MaxGroupSizeForced) {
+      // Set group size for the HW capacity
+      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+      uint32_t NumGroupsPerSubslice =
+          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+    } else {
+      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+             "Invalid maxGroupSize");
+      // Maximize group size: shrink it in KernelWidth steps until the HW
+      // threads of one group evenly divide the threads of a subslice.
+      while (MaxGroupSize >= KernelWidth) {
+        uint32_t NumThreadsPerGroup =
+            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+          uint32_t NumGroupsPerSubslice =
+              NumThreadsPerSubslice / NumThreadsPerGroup;
+          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+          break;
+        }
+        MaxGroupSize -= KernelWidth;
+      }
+    }
+  }
+
+  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  if (!MaxGroupCountForced) {
+    GRPCounts[0] *= OptSubscRate;
+
+    size_t LoopTripcount = 0;
+    if (LoopLevels) {
+      // TODO: consider other possible LoopDesc uses
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop desciptor provided but specific ND-range is disabled\n");
+      // TODO: get rid of this constraint
+      if (LoopLevels->NumLoops > 1) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+             LoopLevels->NumLoops);
+      } else if (LoopLevels->Levels[0].Stride > 0 &&
+                 LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+        // A non-positive stride would divide by zero or yield a meaningless
+        // trip count, so such loops are ignored (LoopTripcount stays 0).
+        LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+                         LoopLevels->Levels[0].Stride) /
+                        LoopLevels->Levels[0].Stride;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+             " = %zu\n",
+             LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+             LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+             LoopTripcount);
+      }
+    }
+
+    // The adjustment divides by the group size, so skip it when that is 0.
+    if (LoopTripcount && GRPSizes[0] > 0) {
+      const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+                                     Device.getNumSubslices() * SIMDWidth;
+      // NOTE: Windows.h defines min()/max() macros, so the calls are guarded
+      //       with parentheses.
+      size_t AdjustedGroupCount =
+          IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+                                      MaxTotalThreads / GRPSizes[0])
+                         : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = (std::max)(AdjustedGroupCount, size_t{1});
+      AdjustedGroupCount *= OptSubscRate;
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Adjusting number of teams using the loop tripcount\n");
+      if (AdjustedGroupCount < GRPCounts[0])
+        GRPCounts[0] = AdjustedGroupCount;
+    }
+  }
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the number of total HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, and the given
+// loop(s) trip counts and group sizes.
+// Returns UINT64_MAX if any trip count, group size or SIMDWidth is zero,
+// if a per-dimension group count does not fit uint32_t, or if the
+// computations overflow.
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+                                     const uint32_t (&GroupSizes)[3],
+                                     uint32_t SIMDWidth) {
+  // NOTE: Windows.h defines max() macro, so we have to guard
+  //       the call with parentheses.
+  constexpr uint64_t Overflow = (std::numeric_limits<uint64_t>::max)();
+  if (SIMDWidth == 0)
+    return Overflow;
+
+  uint64_t TotalGroups = 1;
+  for (int I = 0; I < 3; ++I) {
+    if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+      return Overflow;
+    // Round-up division written so that it cannot wrap for huge trip counts
+    // (Trip + Size - 1 could wrap to a small value, even to zero).
+    const uint64_t Trip = TripCounts[I];
+    const uint64_t Groups =
+        Trip / GroupSizes[I] + (Trip % GroupSizes[I] != 0 ? 1 : 0);
+    if (Groups > (std::numeric_limits<uint32_t>::max)())
+      return Overflow;
+    // TotalGroups is never zero here, so the division is safe.
+    if (Overflow / TotalGroups < Groups)
+      return Overflow;
+    TotalGroups *= Groups;
+  }
+
+  // Multiplication of the group sizes must never overflow uint64_t
+  // for any existing device.
+  const uint64_t LocalWorkSize =
+      uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+  const uint64_t ThreadsPerWG =
+      LocalWorkSize / SIMDWidth + (LocalWorkSize % SIMDWidth != 0 ? 1 : 0);
+
+  // Check that the total number of threads fits uint64_t.
+  if (Overflow / TotalGroups < ThreadsPerWG)
+    return Overflow;
+
+  return TotalGroups * ThreadsPerWG;
+}
+
+/// Compute group sizes and group counts for a loop kernel from the loop nest
+/// described by \p LoopLevels.
+///
+/// \param Device device whose maximum group size and total thread count are
+///        consulted.
+/// \param ThreadLimit if non-zero, forces the group size of dimension 0.
+/// \param LoopLevels loop bounds/strides, number of loops and distribute
+///        dimension. It is dereferenced unconditionally, so it must not be
+///        null.
+/// \param GroupSizes [out] receives the three group sizes.
+/// \param GroupCounts [out] receives the three group counts.
+/// \param HalfNumThreads not read by this function.
+/// \param AllowCooperative [out] always set to false.
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL when a group count would not fit
+///          in 32 bits.
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+    L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+    uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+    bool &AllowCooperative) const {
+
+  const auto DeviceId = Device.getDeviceId();
+  const auto &Options = LevelZeroPluginTy::getOptions();
+  const auto &KernelPR = getProperties();
+  uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+  bool MaxGroupSizeForced = false;
+  if (ThreadLimit > 0) {
+    MaxGroupSizeForced = true;
+    MaxGroupSize = ThreadLimit;
+  }
+
+  uint32_t GRPCounts[3] = {1, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  TgtLoopDescTy *Levels = LoopLevels->Levels;
+  int32_t DistributeDim = LoopLevels->DistributeDim;
+  assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+         "Invalid distribute dimension.");
+  int32_t NumLoops = LoopLevels->NumLoops;
+  assert((NumLoops > 0 && NumLoops <= 3) &&
+         "Invalid loop nest description for ND partitioning");
+
+  // Compute global widths for X/Y/Z dimensions.
+  // Dimensions beyond NumLoops keep a trip count of 1.
+  size_t TripCounts[3] = {1, 1, 1};
+
+  for (int32_t I = 0; I < NumLoops; I++) {
+    assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+         ", Stride = %" PRId64 "\n",
+         I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+    if (Levels[I].Ub < Levels[I].Lb)
+      TripCounts[I] = 0;
+    else
+      TripCounts[I] =
+          (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+  }
+
+  // Check if any of the loop has zero iterations.
+  if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+    std::fill(GroupSizes, GroupSizes + 3, 1);
+    std::fill(GRPCounts, GRPCounts + 3, 1);
+    if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+      // There is a distribute dimension, and the distribute loop
+      // has non-zero iterations, but some inner parallel loop
+      // has zero iterations. We still want to split the distribute
+      // loop's iterations between many WGs (of size 1), but the inner/lower
+      // dimensions should be 1x1.
+      // Note that this code is currently dead, because we are not
+      // hoisting the inner loops' bounds outside of the target regions.
+      // The code is here just for completeness.
+      size_t DistributeTripCount = TripCounts[DistributeDim];
+      if (DistributeTripCount > UINT32_MAX) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Invalid number of teams %zu due to large loop trip count\n",
+             DistributeTripCount);
+        return OFFLOAD_FAIL;
+      }
+      GRPCounts[DistributeDim] = DistributeTripCount;
+    }
+    AllowCooperative = false;
+    GroupCounts.groupCountX = GRPCounts[0];
+    GroupCounts.groupCountY = GRPCounts[1];
+    GroupCounts.groupCountZ = GRPCounts[2];
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (!MaxGroupSizeForced) {
+    // Use zeKernelSuggestGroupSize to compute group sizes,
+    // or fallback to setting dimension 0 width to SIMDWidth.
+    // Note that in case of user-specified LWS GRPSizes[0]
+    // is already set according to the specified value.
+    // NOTE(review): no call to zeKernelSuggestGroupSize is made in this
+    // function, and GlobalSizes below is not read after it is computed.
+    size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+    if (DistributeDim > 0) {
+      // There is a distribute dimension.
+      GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+      GlobalSizes[DistributeDim] = 1;
+    }
+
+    {
+      // Start from the kernel's preferred width when the device allows more.
+      if (MaxGroupSize > KernelPR.Width) {
+        GRPSizes[0] = KernelPR.Width;
+      }
+      if (DistributeDim == 0) {
+        // If there is a distribute dimension, then we do not use
+        // thin HW threads, since we do not know anything about
+        // the iteration space of the inner parallel loop regions.
+        //
+        // If there is no distribute dimension, then try to use thinner
+        // HW threads to get more independent HW threads executing
+        // the kernel - this may allow more parallelism due to
+        // the stalls being distributed across multiple HW threads rather
+        // than across SIMD lanes within one HW thread.
+        assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
+               "Unexpected team sizes for dimensions 1 or/and 2.");
+        uint32_t SimdWidth = KernelPR.SIMDWidth;
+        uint64_t TotalThreads = Device.getTotalThreads();
+        TotalThreads *= Options.ThinThreadsThreshold;
+
+        // GRPSizePrev remembers the last size tried while the kernel still
+        // needed fewer threads than TotalThreads; that size is kept below.
+        uint32_t GRPSizePrev = GRPSizes[0];
+        uint64_t ThreadsNeeded =
+            computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+        while (ThreadsNeeded < TotalThreads) {
+          GRPSizePrev = GRPSizes[0];
+          // Try to halve the local work size (if possible) and see
+          // how many HW threads the kernel will require with this
+          // new local work size.
+          // In most implementations the initial GRPSizes[0]
+          // will be a power-of-two.
+          if (GRPSizes[0] <= 1)
+            break;
+          GRPSizes[0] >>= 1;
+          ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+        }
+        GRPSizes[0] = GRPSizePrev;
+      }
+    }
+  }
+
+  // Derive the group count of every loop dimension from its trip count.
+  for (int32_t I = 0; I < NumLoops; I++) {
+    // Dimensions below the distribute dimension get a single group.
+    if (I < DistributeDim) {
+      GRPCounts[I] = 1;
+      continue;
+    }
+    size_t Trip = TripCounts[I];
+    // Never use a group larger than the number of iterations.
+    if (GRPSizes[I] >= Trip)
+      GRPSizes[I] = Trip;
+    size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
+    if (Count > UINT32_MAX) {
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Invalid number of teams %zu due to large loop trip count\n", Count);
+      return OFFLOAD_FAIL;
+    }
+    GRPCounts[I] = (uint32_t)Count;
+  }
+  AllowCooperative = false;
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Resolve the effective number of teams and thread limit for a launch and
+/// delegate to decideKernelGroupArguments() to compute the group shape.
+///
+/// A positive \p ThreadLimit / \p NumTeams (clause value) takes precedence
+/// over the corresponding value read from the host runtime; the thread limit
+/// is then capped to what the device and the kernel support.
+///
+/// \param SubDevice device the kernel will run on.
+/// \param NumTeams requested number of teams, or <= 0 if unspecified.
+/// \param ThreadLimit requested thread limit, or <= 0 if unspecified.
+/// \param GroupSizes [out] receives the three group sizes.
+/// \param GroupCounts [out] receives the three group counts.
+/// \param LoopDesc optional loop description (a TgtNDRangeDescTy) or null.
+/// \param AllowCooperative [out] always set to false.
+/// \returns OFFLOAD_SUCCESS.
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+                                   int32_t ThreadLimit, uint32_t *GroupSizes,
+                                   ze_group_count_t &GroupCounts,
+                                   void *LoopDesc,
+                                   bool &AllowCooperative) const {
+
+  const auto SubId = SubDevice.getDeviceId();
+  const auto &KernelPR = getProperties();
+
+  // Detect if we need to reduce available HW threads. We need this adjustment
+  // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  static std::once_flag OnceFlag;
+  static bool ZeDebugEnabled = false;
+  std::call_once(OnceFlag, []() {
+    const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
+    if (EnvVal && std::atoi(EnvVal) == 1)
+      ZeDebugEnabled = true;
+  });
+
+  // Read the most recent global thread limit and max teams.
+  auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+
+  bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+  bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+  uint32_t KernelWidth = KernelPR.Width;
+  uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+  if (ThreadLimit > 0) {
+    // use thread_limit clause value default
+    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+       ThreadLimit);
+  } else if (ThreadLimitICV > 0) {
+    // else use thread-limit-var ICV
+    ThreadLimit = ThreadLimitICV;
+    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+  }
+
+  size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+  // Set correct max group size if the kernel was compiled with explicit SIMD
+  if (SIMDWidth == 1) {
+    MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+  }
+
+  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+    DP("Capping maximum team size to %zu due to kernel constraints.\n",
+       MaxThreadLimit);
+  }
+
+  if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
+    ThreadLimit = MaxThreadLimit;
+    DP("Max team size execceds current maximum %zu. Adjusted\n",
+       MaxThreadLimit);
+  }
+
+  if (NumTeams > 0) {
+    DP("Number of teams is set to %" PRId32
+       "(num_teams clause or no teams construct)\n",
+       NumTeams);
+  } else if (NumTeamsICV > 0) {
+    // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. That is
+    // the case here, so the ICV value is applied (and must not be reported
+    // as ignored).
+    NumTeams = NumTeamsICV;
+    DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+       NumTeams);
+  }
+
+  // A null LoopDesc simply means no loop information is available.
+  decideKernelGroupArguments(
+      SubDevice, static_cast<uint32_t>(NumTeams),
+      static_cast<uint32_t>(ThreadLimit),
+      static_cast<TgtNDRangeDescTy *>(LoopDesc), GroupSizes, GroupCounts,
+      HalfNumThreads, false);
+  AllowCooperative = false;
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Prepare and submit this kernel on \p l0Device.
+///
+/// Steps: clamp negative team/thread requests to 0, pick (or reuse cached)
+/// group sizes and counts, set the kernel arguments, indirect access flags
+/// and group size, then submit either through an immediate command list or
+/// through a regular command list and queue. Kernel preparation is done
+/// while holding the kernel properties mutex because kernels are shared.
+///
+/// \param l0Device device to launch on.
+/// \param KernelArgs launch arguments (argument count, teams, thread limit).
+/// \param LaunchParams LaunchParams.Data is read as an array of
+///        KernelArgs.NumArgs pointer-sized argument values.
+/// \param AsyncInfo optional async info; when async execution is enabled the
+///        launch event is recorded in its queue instead of being waited on.
+/// \returns OFFLOAD_SUCCESS, or a failure code when group shape selection or
+///          a Level Zero call fails.
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+                                        KernelArgsTy &KernelArgs,
+                                        KernelLaunchParamsTy LaunchParams,
+                                        __tgt_async_info *AsyncInfo) const {
+  // Libomptarget can pass negative NumTeams and ThreadLimit now after
+  // introducing __tgt_target_kernel. This happens only when we have valid
+  // LoopDesc and the region is not a teams region.
+
+  auto zeKernel = getZeKernel();
+  auto DeviceId = l0Device.getDeviceId();
+  int32_t NumArgs = KernelArgs.NumArgs;
+  int32_t NumTeams = KernelArgs.NumTeams[0];
+  int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+  // No loop description is provided on this path.
+  void *LoopDesc = nullptr;
+
+  if (NumTeams < 0)
+    NumTeams = 0;
+  if (ThreadLimit < 0)
+    ThreadLimit = 0;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+  auto &Plugin = l0Device.getPlugin();
+  auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+  auto *IdStr = Device.getZeIdCStr();
+  auto &Options = LevelZeroPluginTy::getOptions();
+  bool IsAsync = AsyncInfo && Device.asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  auto *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+
+  // We need to get a non-const version of the Properties structure in order to
+  // use its lock and be able to cache the group params and indirect flags
+  auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
+  // Protect from kernel preparation to submission as kernels are shared.
+  std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
+
+  // Decide group sizes and counts
+  uint32_t GroupSizes[3];
+  ze_group_count_t GroupCounts;
+
+  bool AllowCooperative = false;
+
+  // Check if we can reuse previous group parameters
+  bool GroupParamsReused = KernelPR.reuseGroupParams(
+      static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
+      GroupSizes, GroupCounts, AllowCooperative);
+
+  if (!GroupParamsReused) {
+    auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+                             GroupCounts, LoopDesc, AllowCooperative);
+
+    if (RC != OFFLOAD_SUCCESS) {
+      return RC;
+    }
+
+    KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
+                              NumTeams, ThreadLimit, GroupSizes, GroupCounts,
+                              AllowCooperative);
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+       GroupSizes[1], GroupSizes[2]);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+       GroupCounts.groupCountX, GroupCounts.groupCountY,
+       GroupCounts.groupCountZ);
+  for (int32_t I = 0; I < NumArgs; I++) {
+    {
+      void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
+      // A null argument value is passed as a null pointer-to-value.
+      CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+                       Arg == nullptr ? nullptr : &Arg);
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+           ") was set successfully for device %s.\n",
+           I, DPxPTR(Arg), IdStr);
+    }
+  }
+
+  // Set Kernel Indirect flags
+  auto &PrevFlags = KernelPR.IndirectAccessFlags;
+  ze_kernel_indirect_access_flags_t Flags = 0;
+  Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+  Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+  // Only touch the kernel when the flags differ from the cached ones.
+  if (PrevFlags != Flags) {
+    // Combine with common access flags
+    const auto FinalFlags = Device.getIndirectFlags() | Flags;
+    CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+    DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+    PrevFlags = Flags;
+  }
+
+  if (!GroupParamsReused) {
+    CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+                     GroupSizes[1], GroupSizes[2]);
+  }
+
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+  const bool UseImmCmdList = Device.useImmForCompute();
+
+  if (UseImmCmdList) {
+    CmdList = Device.getImmCmdList();
+    // Command queue is not used with immediate command list
+  } else {
+    CmdList = Device.getCmdList();
+    CmdQueue = Device.getCmdQueue();
+  }
+
+  if (UseImmCmdList) {
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Using immediate command list for kernel submission.\n");
+    auto Event = Device.getEvent();
+    size_t NumWaitEvents = 0;
+    ze_event_handle_t *WaitEvents = nullptr;
+    if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
+      if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
+        // In ordered mode only the most recent event is waited on.
+        NumWaitEvents = 1;
+        WaitEvents = &AsyncQueue->WaitEvents.back();
+      } else {
+        NumWaitEvents = AsyncQueue->WaitEvents.size();
+        WaitEvents = AsyncQueue->WaitEvents.data();
+      }
+    }
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+    if (AllowCooperative) {
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                       zeKernel, &GroupCounts, Event, NumWaitEvents,
+                       WaitEvents);
+    } else {
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                       &GroupCounts, Event, NumWaitEvents, WaitEvents);
+    }
+    KernelLock.unlock();
+    if (IsAsync) {
+      AsyncQueue->WaitEvents.push_back(Event);
+      AsyncQueue->KernelEvent = Event;
+    } else {
+      CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+      Device.releaseEvent(Event);
+    }
+  } else {
+    ze_event_handle_t Event = nullptr;
+    // The launch has to be recorded in the command list before the list is
+    // closed and executed; otherwise an empty list is submitted and the
+    // kernel never runs.
+    if (AllowCooperative) {
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                       zeKernel, &GroupCounts, Event, 0, nullptr);
+    } else {
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                       &GroupCounts, Event, 0, nullptr);
+    }
+    KernelLock.unlock();
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+                         CmdQueue, 1, &CmdList, nullptr);
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+    // This path is always synchronous: wait before recycling the list.
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+    if (Event) {
+      Device.releaseEvent(Event);
+    }
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+       IdStr);
+
+  return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
new file mode 100644
index 0000000000000..790acdd9f568f
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -0,0 +1,637 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Memory.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Hand out one free chunk of this block, or nullptr when the block is full.
+void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
+  if (isFull())
+    return nullptr;
+
+  // Prefer the slot remembered by the last dealloc(); otherwise fall back to
+  // a linear scan for the first slot that is not marked as used.
+  uint32_t Chosen = UINT32_MAX;
+  if (FreeSlot != UINT32_MAX) {
+    Chosen = FreeSlot;
+    FreeSlot = UINT32_MAX;
+  } else {
+    for (uint32_t Idx = 0; Idx < NumSlots && Chosen == UINT32_MAX; ++Idx) {
+      if (!UsedSlots[Idx])
+        Chosen = Idx;
+    }
+  }
+
+  if (Chosen == UINT32_MAX) {
+    // Should not reach here.
+    assert(0 && "Inconsistent memory pool state");
+    return nullptr;
+  }
+
+  // Claim the slot and translate its index into an address inside the block.
+  UsedSlots[Chosen] = true;
+  NumUsedSlots++;
+  return reinterpret_cast<void *>(Base + Chosen * ChunkSize);
+}
+
+/// Return the chunk at \p Mem to this block and remember its slot so that
+/// the next alloc() can reuse it without scanning.
+void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
+  if (!contains(Mem))
+    assert(0 && "Inconsistent memory pool state");
+  // Slot index = byte distance from the block base, in units of ChunkSize.
+  const auto ByteOffset = reinterpret_cast<uintptr_t>(Mem) - Base;
+  const uint32_t Slot = ByteOffset / ChunkSize;
+  FreeSlot = Slot;
+  NumUsedSlots--;
+  UsedSlots[Slot] = false;
+}
+
+/// Construct the memory pool for allocation kind \p Kind.
+///
+/// Pool limits come from the entry for that kind in Option.MemPoolInfo:
+/// element 0 is read as the maximum pooled allocation size, element 1 as the
+/// number of chunks per block, and element 2 as the maximum pool size (the
+/// sizes are converted from MB to bytes below). The constructor then derives
+/// the allocation unit from the page size reported by L0 and fills in the
+/// per-bucket chunk/block sizes.
+MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = Kind;
+  Allocator = _Allocator;
+
+  // Read user-defined options
+  // Note: at() requires that an entry for this kind exists in MemPoolInfo.
+  const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
+  const size_t UserAllocMax = UserOptions[0];
+  const size_t UserCapacity = UserOptions[1];
+  const size_t UserPoolSize = UserOptions[2];
+
+  BlockCapacity = UserCapacity;
+  PoolSizeMax = UserPoolSize << 20; // MB to B
+  PoolSize = 0;
+
+  auto Context = Allocator->L0Context->getZeContext();
+  const auto Device = Allocator->Device;
+
+  // Check page size used for this allocation kind to decide minimum
+  // allocation size when allocating from L0.
+  // NOTE(review): the pointer returned by allocL0 is not checked before it is
+  // passed on. Also, CALL_ZE_RET_VOID presumably returns from the constructor
+  // on failure, which would skip the zeMemFree and the bucket setup below --
+  // confirm against the macro definition.
+  void *Mem = Allocator->allocL0(8, 0, AllocKind);
+  ze_memory_allocation_properties_t AP{
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
+      ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
+  CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+  AllocUnit = (std::max)(AP.pageSize, AllocUnit);
+  CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+
+  bool IsDiscrete = false;
+  if (Device) {
+    // NOTE(review): Properties is filled in here but not read afterwards in
+    // this constructor; the discrete/integrated decision comes from
+    // Device->isDiscreteDevice().
+    ze_device_properties_t Properties{};
+    Properties.deviceId = 0;
+    Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    Properties.pNext = nullptr;
+    CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+    IsDiscrete = Device->isDiscreteDevice();
+
+    if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
+      // Use page size as minimum chunk size for USM shared on discrete
+      // device.
+      // FIXME: pageSize is not returned correctly (=0) on some new devices,
+      //        so use fallback value for now.
+      AllocMin = (std::max)(AP.pageSize, AllocUnit);
+      AllocUnit = AllocMin * BlockCapacity;
+    }
+  }
+
+  // Convert MB to B and round up to power of 2
+  AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
+  if (AllocMin >= AllocMax) {
+    AllocMax = 2 * AllocMin;
+    DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
+       "requirements.\n",
+       AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+  }
+  assert(AllocMin < AllocMax &&
+         "Invalid parameters while initializing memory pool");
+  // One bucket per bucket id between the one for AllocMin and the one for
+  // AllocMax, inclusive.
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+
+  // Set bucket parameters
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    // Chunk size doubles from one bucket to the next, starting at AllocMin.
+    const size_t ChunkSize = AllocMin << I;
+    size_t BlockSize = ChunkSize * BlockCapacity;
+    // On discrete device, the cost of native L0 invocation doubles when
+    // the requested size doubles after certain threshold, so allocating
+    // larger block does not pay off at all. It is better to keep a single
+    // chunk in a single block in such cases.
+    if (BlockSize <= AllocUnit) {
+      BlockSize = AllocUnit; // Allocation unit is already large enough
+    } else if (IsDiscrete) {
+      // Do not preallocate if it does not pay off
+      if (ChunkSize >= L0UsmPreAllocThreshold ||
+          (AllocKind == TARGET_ALLOC_HOST &&
+           ChunkSize >= L0HostUsmPreAllocThreshold))
+        BlockSize = ChunkSize;
+    }
+    // BucketParams[I] = (chunk size, block size) for bucket I.
+    BucketParams.emplace_back(ChunkSize, BlockSize);
+  }
+
+  DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
+     "AllocMax = %zu, "
+     "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
+     ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+     BlockCapacity, PoolSizeMax);
+}
+
+// Used for reduction pool
+/// Construct the device-memory pool used for reduction scratch space.
+///
+/// Sizes come from Option.ReductionPoolInfo: element 0 is the maximum
+/// pooled allocation size in MB, element 1 the number of chunks per block,
+/// and element 2 the maximum pool size in MB. The minimum chunk size and
+/// allocation unit are fixed at 64KB.
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = _Allocator;
+  AllocMin = AllocUnit = 1024 << 6; // 64KB
+  // Widen to size_t before converting MB to bytes so that the shift is done
+  // in the destination width rather than in the (narrower) option type; this
+  // matches how PoolSizeMax is computed.
+  AllocMax = static_cast<size_t>(Option.ReductionPoolInfo[0]) << 20;
+  BlockCapacity = Option.ReductionPoolInfo[1];
+  PoolSize = 0;
+  PoolSizeMax = static_cast<size_t>(Option.ReductionPoolInfo[2]) << 20;
+
+  // One bucket per bucket id between the one for AllocMin and the one for
+  // AllocMax, inclusive; chunk size doubles from one bucket to the next.
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const size_t ChunkSize = AllocMin << I;
+    BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
+  }
+
+  DP("Initialized reduction scratch pool for device " DPxMOD
+     ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+// Used for small memory pool with fixed parameters
+/// Construct the zero-initialized device pool used for reduction counters.
+/// It has a single bucket whose chunk size is AllocMin and whose block size
+/// is AllocUnit.
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
+  Allocator = _Allocator;
+  AllocKind = TARGET_ALLOC_DEVICE;
+
+  // Single bucket: every chunk is AllocMin bytes, one block spans AllocUnit.
+  AllocMax = AllocMin;
+  BlockCapacity = AllocUnit / AllocMax;
+  Buckets.resize(1);
+  BucketStats.resize(1, {0, 0});
+  BucketParams.emplace_back(AllocMax, AllocUnit);
+
+  PoolSize = 0;
+  PoolSizeMax = (1 << 20); // this should be sufficiently large
+
+  // New blocks are filled from this all-zero host buffer (see alloc()).
+  ZeroInit = true;
+  ZeroInitValue.resize(AllocUnit, 0);
+
+  DP("Initialized zero-initialized reduction counter pool for "
+     "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+/// Print per-bucket usage counters of this pool (new block allocations,
+/// reuses and the resulting hit rate) for buckets that were used at all.
+void MemAllocatorTy::MemPoolTy::printUsage() {
+  // Large counts are printed in scientific notation to keep columns aligned.
+  const auto EmitCount = [](uint64_t Count) {
+    if (Count > 1e9) {
+      fprintf(stderr, "%11.2e", float(Count));
+      return;
+    }
+    fprintf(stderr, "%11" PRIu64, Count);
+  };
+  const auto WasUsed = [](const auto &Counters) {
+    return Counters.first > 0 || Counters.second > 0;
+  };
+
+  bool AnyUsed = false;
+  for (size_t B = 0; B < BucketStats.size() && !AnyUsed; ++B)
+    AnyUsed = WasUsed(BucketStats[B]);
+
+  DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+     DPxPTR(Allocator->Device));
+
+  if (!AnyUsed) {
+    DP("-- Not used\n");
+    return;
+  }
+
+  DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n",
+     AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20);
+  DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)");
+  for (size_t B = 0; B < Buckets.size(); ++B) {
+    const auto &Counters = BucketStats[B];
+    if (!WasUsed(Counters))
+      continue;
+    DP("-- Bucket[%10zu]:", BucketParams[B].first);
+    EmitCount(Counters.first);
+    EmitCount(Counters.second);
+    // Hit rate = reuses / (new allocations + reuses), in percent.
+    fprintf(stderr, "%11.2f\n",
+            float(Counters.second) / float(Counters.first + Counters.second) *
+                100);
+  }
+}
+
+/// Release resources used in the pool.
+///
+/// Frees the L0 memory backing every block of every bucket and deletes the
+/// block bookkeeping objects. When the debug level is positive, the usage
+/// report is printed first and each block's size is passed to
+/// Allocator->log() before it is freed.
+MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+  const int DebugLevel = getDebugLevel();
+  if (DebugLevel > 0)
+    printUsage();
+  for (auto &Bucket : Buckets) {
+    for (auto *Block : Bucket) {
+      if (DebugLevel > 0)
+        Allocator->log(0, Block->Size, AllocKind);
+      // NOTE(review): CALL_ZE_RET_VOID presumably returns from the destructor
+      // when zeMemFree fails, which would leave this Block object and all
+      // remaining blocks unreleased -- confirm against the macro definition.
+      CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
+                       reinterpret_cast<void *>(Block->Base));
+      delete Block;
+    }
+  }
+}
+
+/// Allocate the requested size of memory from this pool.
+/// AllocSize is the chunk size internally used for the returned memory.
+///
+/// Returns nullptr (leaving \p AllocSize untouched) when \p Size is zero or
+/// larger than AllocMax, when the pool has reached its size limit, or when a
+/// new block cannot be obtained from Level Zero or cannot be
+/// zero-initialized.
+void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+  if (Size == 0 || Size > AllocMax)
+    return nullptr;
+
+  const uint32_t BucketId = getBucketId(Size);
+  auto &Blocks = Buckets[BucketId];
+  void *Mem = nullptr;
+
+  // First try to reuse a chunk from an existing block of this bucket.
+  for (auto *Block : Blocks) {
+    if (Block->isFull())
+      continue;
+    Mem = Block->alloc();
+    assert(Mem && "Inconsistent state while allocating memory from pool");
+    PtrToBlock.emplace(Mem, Block);
+    break;
+  }
+
+  if (Mem == nullptr) {
+    // Small requests may still grow the pool past PoolSizeMax, accounted
+    // separately in SmallPoolSize.
+    const bool IsSmallAllocatable =
+        (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
+    const bool IsFull = (PoolSize > PoolSizeMax);
+    if (IsFull && !IsSmallAllocatable)
+      return nullptr;
+    // Bucket is empty or all blocks in the bucket are full
+    const auto ChunkSize = BucketParams[BucketId].first;
+    const auto BlockSize = BucketParams[BucketId].second;
+    void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+    if (Base == nullptr) {
+      // Do not register a block without backing memory: it would be counted
+      // in the pool size and later hand out addresses derived from a null
+      // base.
+      DP("Failed to allocate a new block for %s pool\n",
+         ALLOC_KIND_TO_STR(AllocKind));
+      return nullptr;
+    }
+
+    if (ZeroInit) {
+      auto RC =
+          Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
+      if (RC != OFFLOAD_SUCCESS) {
+        // NOTE(review): Base is not released on this path; whether it can be
+        // freed safely depends on the state of the failed copy -- confirm.
+        DP("Failed to zero-initialize pool memory\n");
+        return nullptr;
+      }
+    }
+
+    BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
+    Blocks.push_back(Block);
+    Mem = Block->alloc();
+    PtrToBlock.emplace(Mem, Block);
+    if (IsFull)
+      SmallPoolSize += BlockSize;
+    else
+      PoolSize += BlockSize;
+    DP("New block allocation for %s pool: base = " DPxMOD
+       ", size = %zu, pool size = %zu\n",
+       ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
+    // first = number of new block allocations, second = number of reuses.
+    BucketStats[BucketId].first++;
+  } else {
+    BucketStats[BucketId].second++;
+  }
+
+  AllocSize = (AllocMin << BucketId);
+
+  return Mem;
+}
+
+/// Give \p Ptr back to the block it was taken from.
+/// Returns the chunk size of that block, or 0 when \p Ptr was not handed out
+/// by this pool.
+size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
+  const auto Entry = PtrToBlock.find(Ptr);
+  if (Entry == PtrToBlock.end())
+    return 0;
+  auto &OwningBlock = Entry->second;
+  OwningBlock->dealloc(Ptr);
+  const size_t Deallocated = OwningBlock->ChunkSize;
+  PtrToBlock.erase(Entry);
+  return Deallocated;
+}
+
+/// Record allocation information for \p Ptr and, in asserts-enabled builds,
+/// verify that the new range [Ptr, Ptr + Size) does not overlap its
+/// neighbours in the map.
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
+                                            int32_t Kind, bool InPool,
+                                            bool ImplicitArg) {
+  const auto Result =
+      Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg});
+  const auto Pos = Result.first;
+
+  // Check if we keep valid disjoint memory ranges.
+  [[maybe_unused]] bool Disjoint = Result.second;
+  if (Disjoint && Pos != Map.begin()) {
+    // The previous entry must end at or before the new start address.
+    const auto Before = std::prev(Pos);
+    Disjoint = reinterpret_cast<uintptr_t>(Before->first) +
+                   Before->second.Size <=
+               reinterpret_cast<uintptr_t>(Ptr);
+  }
+  if (Disjoint) {
+    // The new range must end at or before the start of the next entry.
+    const auto After = std::next(Pos);
+    if (After != Map.end())
+      Disjoint = reinterpret_cast<uintptr_t>(Ptr) + Size <=
+                 reinterpret_cast<uintptr_t>(After->first);
+  }
+  assert(Disjoint && "Invalid overlapping memory allocation");
+
+  if (ImplicitArg)
+    NumImplicitArgs[Kind]++;
+}
+
+/// Drop the allocation record for \p Ptr.
+/// When \p Removed is non-null it receives a copy of the dropped record.
+/// Returns false if no record exists for \p Ptr.
+bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
+                                               MemAllocInfoTy *Removed) {
+  const auto Found = Map.find(Ptr);
+  if (Found == Map.end())
+    return false;
+
+  const auto &Record = Found->second;
+  if (Record.ImplicitArg)
+    NumImplicitArgs[Record.Kind]--;
+  if (Removed != nullptr)
+    *Removed = Record;
+
+  Map.erase(Found);
+  return true;
+}
+
+/// Bind this allocator to \p L0Device and create its device-side pools: one
+/// pool per enabled kind (device, shared), plus the reduction scratch pool
+/// and the reduction counter pool.
+void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
+                                     const L0OptionsTy &Option) {
+  SupportsLargeMem = L0Device.supportsLargeMem();
+  L0Context = &L0Device.getL0Context();
+  Device = &L0Device;
+  IsHostMem = false;
+
+  for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
+    // A pool is only created for kinds that have an entry in MemPoolInfo.
+    const bool PoolEnabled =
+        Option.MemPoolInfo.find(Kind) != Option.MemPoolInfo.end();
+    if (PoolEnabled) {
+      std::lock_guard<std::mutex> Lock(Mtx);
+      Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+                    std::forward_as_tuple(Kind, this, Option));
+    }
+    // Usage statistics are only tracked when debugging is enabled.
+    if (getDebugLevel() > 0)
+      Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+                    std::tuple<>{});
+  }
+
+  ReductionPool = std::make_unique<MemPoolTy>(this, Option);
+  CounterPool = std::make_unique<MemPoolTy>(this);
+  updateMaxAllocSize(L0Device);
+}
+
+/// Bind this allocator to the context \p Driver as a host-memory allocator
+/// and create the host pool when one is configured in Option.MemPoolInfo.
+void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
+                                  const L0OptionsTy &Option) {
+  SupportsLargeMem = Driver.supportsLargeMem();
+  this->L0Context = &Driver;
+  IsHostMem = true;
+
+  const bool PoolEnabled = Option.MemPoolInfo.find(TARGET_ALLOC_HOST) !=
+                           Option.MemPoolInfo.end();
+  if (PoolEnabled) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    Pools.emplace(std::piecewise_construct,
+                  std::forward_as_tuple(TARGET_ALLOC_HOST),
+                  std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
+  }
+
+  // Usage statistics are only tracked when debugging is enabled.
+  if (getDebugLevel() > 0)
+    Stats.emplace(std::piecewise_construct,
+                  std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{});
+}
+
+/// Refresh MaxAllocSize from the maxMemAllocSize reported by \p L0Device.
+/// A device allocator simply adopts the device's value; a host allocator
+/// only ever lowers its limit.
+void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
+  ze_device_properties_t DevProps;
+  DevProps.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+  DevProps.pNext = nullptr;
+  DevProps.maxMemAllocSize = 0;
+  CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &DevProps);
+
+  if (!IsHostMem) {
+    MaxAllocSize = DevProps.maxMemAllocSize;
+    DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
+       MaxAllocSize);
+    return;
+  }
+
+  // MaxAllocSize should be the minimum of all devices from the driver
+  if (MaxAllocSize > DevProps.maxMemAllocSize) {
+    MaxAllocSize = DevProps.maxMemAllocSize;
+    DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
+       DPxPTR(L0Context), MaxAllocSize);
+  }
+}
+
+/// Tear the allocator down: release the memory tracked in MemOwned, destroy
+/// all pools, print per-kind usage statistics when debugging is enabled, and
+/// finally clear L0Context to mark the allocator as deinitialized.
+void MemAllocatorTy::deinit() {
+  std::lock_guard<std::mutex> Lock(Mtx);
+
+  // Release RTL-owned memory
+  for (auto *Owned : MemOwned)
+    dealloc_locked(Owned);
+
+  // Release resources used in the pool
+  Pools.clear();
+  ReductionPool.reset(nullptr);
+  CounterPool.reset(nullptr);
+
+  // Report memory usage if requested
+  if (getDebugLevel() > 0) {
+    for (auto &Entry : Stats) {
+      DP("Memory usage for %s, device " DPxMOD "\n",
+         ALLOC_KIND_TO_STR(Entry.first), DPxPTR(Device));
+      const auto &Usage = Entry.second;
+      // Index 0 holds the native (L0) counters, index 1 the pool counters.
+      const bool Unused = Usage.NumAllocs[0] == 0 && Usage.NumAllocs[1] == 0;
+      if (Unused) {
+        DP("-- Not used\n");
+        continue;
+      }
+      DP("-- Allocator: %12s, %12s\n", "Native", "Pool");
+      DP("-- Requested: %12zu, %12zu\n", Usage.Requested[0],
+         Usage.Requested[1]);
+      DP("-- Allocated: %12zu, %12zu\n", Usage.Allocated[0],
+         Usage.Allocated[1]);
+      DP("-- Freed    : %12zu, %12zu\n", Usage.Freed[0], Usage.Freed[1]);
+      DP("-- InUse    : %12zu, %12zu\n", Usage.InUse[0], Usage.InUse[1]);
+      DP("-- PeakUse  : %12zu, %12zu\n", Usage.PeakUse[0], Usage.PeakUse[1]);
+      DP("-- NumAllocs: %12zu, %12zu\n", Usage.NumAllocs[0],
+         Usage.NumAllocs[1]);
+    }
+  }
+
+  // mark as deinitialized
+  L0Context = nullptr;
+}
+
+/// Allocate memory with the specified information
+///
+/// The request is served from a pool when a pool exists for \p Kind and no
+/// memory advice is given (\p MemAdvice == UINT32_MAX), or when \p AllocOpt
+/// selects one of the dedicated reduction pools. If the pool cannot serve
+/// it, the memory is allocated directly from Level Zero.
+///
+/// \param Size      number of bytes requested by the caller.
+/// \param Align     requested alignment; 0 means no requirement. The
+///                  rounding below assumes a power of two.
+/// \param Kind      TARGET_ALLOC_DEVICE, TARGET_ALLOC_HOST or
+///                  TARGET_ALLOC_SHARED.
+/// \param Offset    bytes reserved in front of the returned pointer.
+/// \param UserAlloc forwarded to the allocation record (as its ImplicitArg
+///                  field).
+/// \param DevMalloc when true the allocation base is added to MemOwned so
+///                  that deinit() releases it.
+/// \returns the usable pointer, or nullptr on failure.
+void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
+                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
+                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+  assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
+          Kind == TARGET_ALLOC_SHARED) &&
+         "Unknown memory kind while allocating target memory");
+
+  std::lock_guard<std::mutex> Lock(Mtx);
+
+  // We do not expect meaningful Align parameter when Offset > 0, so the
+  // following code does not handle such case.
+
+  size_t AllocSize = Size + Offset;
+  void *Mem = nullptr;
+  void *AllocBase = nullptr;
+  const bool UseScratchPool =
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH);
+  const bool UseZeroInitPool =
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+  const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
+
+  if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+    // Pool is enabled for the allocation kind, and we do not use any memory
+    // advice. We should avoid using pool if there is any meaningful memory
+    // advice not to affect sibling allocation in the same block.
+    // The pool gives no alignment guarantee, so reserve Align - 1 spare
+    // bytes to be able to round the returned address up.
+    if (Align > 0)
+      AllocSize += (Align - 1);
+    size_t PoolAllocSize = 0;
+    if (UseScratchPool)
+      AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize);
+    else if (UseZeroInitPool)
+      AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
+    else
+      AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize);
+    if (AllocBase) {
+      uintptr_t Base = (uintptr_t)AllocBase;
+      // Round up to the next multiple of Align. Adding Align - 1 (not Align)
+      // keeps an already aligned address in place, so the user range never
+      // extends past the Align - 1 spare bytes reserved above.
+      if (Align > 0)
+        Base = (Base + Align - 1) & ~(Align - 1);
+      Mem = (void *)(Base + Offset);
+      AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc);
+      log(Size, PoolAllocSize, Kind, true /* Pool */);
+      // NOTE(review): MemOwned records AllocBase while the allocation record
+      // is keyed by Mem; deinit() passes MemOwned entries to dealloc_locked(),
+      // which looks records up by key, so the two only agree when
+      // Mem == AllocBase -- confirm this is intended.
+      if (DevMalloc)
+        MemOwned.push_back(AllocBase);
+      if (UseDedicatedPool) {
+        DP("Allocated %zu bytes from %s pool\n", Size,
+           UseScratchPool ? "scratch" : "zero-initialized");
+      }
+      return Mem;
+    }
+  }
+
+  // Fall back to a direct Level Zero allocation.
+  AllocBase = allocL0(AllocSize, Align, Kind, Size);
+  if (AllocBase) {
+    Mem = (void *)((uintptr_t)AllocBase + Offset);
+    AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
+    if (DevMalloc)
+      MemOwned.push_back(AllocBase);
+    if (UseDedicatedPool) {
+      // We do not want this happen in general.
+      DP("Allocated %zu bytes from L0 for %s pool\n", Size,
+         UseScratchPool ? "scratch" : "zero-initialized");
+    }
+  }
+  return Mem;
+}
+
+/// Release the allocation identified by \p Ptr.
+/// Pooled memory is handed back to the pool that owns it; everything else is
+/// freed through zeMemFree. Returns OFFLOAD_FAIL when \p Ptr is unknown or
+/// the memory cannot be returned.
+int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
+  MemAllocInfoTy Info;
+  const bool Known = AllocInfo.remove(Ptr, &Info);
+  if (!Known) {
+    DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
+       DPxPTR(Ptr));
+    return OFFLOAD_FAIL;
+  }
+
+  if (!Info.InPool) {
+    // Native allocation: free the base address directly.
+    if (!Info.Base) {
+      DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr));
+      return OFFLOAD_FAIL;
+    }
+    CALL_ZE_RET_FAIL(zeMemFree, L0Context->getZeContext(), Info.Base);
+    log(0, Info.Size, Info.Kind);
+
+    DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
+       DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  // Pooled allocation: try the pool of the matching kind first, then the
+  // reduction scratch pool, then the reduction counter pool.
+  size_t Returned = 0;
+  const auto PoolIt = Pools.find(Info.Kind);
+  if (PoolIt != Pools.end())
+    Returned = PoolIt->second.dealloc(Info.Base);
+  if (Returned == 0)
+    Returned = ReductionPool->dealloc(Info.Base);
+  if (Returned == 0)
+    Returned = CounterPool->dealloc(Info.Base);
+  if (Returned == 0) {
+    DP("Error: Cannot return memory " DPxMOD " to pool\n", DPxPTR(Ptr));
+    return OFFLOAD_FAIL;
+  }
+
+  log(0, Returned, Info.Kind, true /* Pool */);
+  return OFFLOAD_SUCCESS;
+}
+
+/// Copy \p Size bytes from \p Src to \p Dst by forwarding the request to the
+/// device this allocator is bound to.
+int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
+                                       size_t Size) {
+  const int32_t Status = Device->enqueueMemCopy(Dst, Src, Size);
+  return Status;
+}
+
+/// Allocate \p Size bytes of \p Kind memory directly from Level Zero.
+///
+/// \p Align is forwarded to the L0 allocation call. \p ActiveSize, when
+/// non-zero, is the size passed to log() instead of \p Size. Device
+/// allocations are additionally made resident on the device.
+///
+/// \returns the allocated pointer, or nullptr when the L0 call fails, when
+/// \p Kind is invalid, or when the memory cannot be made resident (in which
+/// case the allocation is released again).
+void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
+                              size_t ActiveSize) {
+  void *Mem = nullptr;
+  ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+                                        nullptr, 0, 0};
+  ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+                                    nullptr, 0};
+
+  // Use relaxed allocation limit if driver supports
+  ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{
+      ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr,
+      ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE};
+  if (Size > MaxAllocSize && SupportsLargeMem) {
+    DeviceDesc.pNext = &RelaxedDesc;
+    HostDesc.pNext = &RelaxedDesc;
+  }
+
+  // A host allocator has no device; pass a null device handle in that case.
+  auto zeDevice = Device ? Device->getZeDevice() : nullptr;
+  auto zeContext = L0Context->getZeContext();
+  bool makeResident = false;
+  switch (Kind) {
+  case TARGET_ALLOC_DEVICE:
+    makeResident = true;
+    CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
+                     zeDevice, &Mem);
+    DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  case TARGET_ALLOC_HOST:
+    CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
+    DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  case TARGET_ALLOC_SHARED:
+    CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
+                     Align, zeDevice, &Mem);
+    DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  default:
+    assert(0 && "Invalid target data allocation kind");
+    // Nothing was allocated; do not log or touch residency.
+    return nullptr;
+  }
+
+  if (makeResident) {
+    assert(Device &&
+           "Device is not set for memory allocation. Is this a Device Pool?");
+    if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS) {
+      // The caller only sees nullptr and cannot free the allocation, so
+      // release it here instead of leaking it.
+      CALL_ZE_RET_NULL(zeMemFree, zeContext, Mem);
+      return nullptr;
+    }
+  }
+
+  // Only allocations that are actually handed out are logged.
+  size_t LoggedSize = ActiveSize ? ActiveSize : Size;
+  log(LoggedSize, LoggedSize, Kind);
+  return Mem;
+}
+
+/// Take one event from the pool.
+/// When no free event is left, a new L0 event pool with PoolSize events is
+/// created first. Returns nullptr if an L0 call fails.
+ze_event_handle_t EventPoolTy::getEvent() {
+  std::lock_guard<std::mutex> Lock(*Mtx);
+
+  if (Events.empty()) {
+    // Need to create a new L0 pool
+    ze_event_pool_desc_t PoolDesc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0,
+                                  0};
+    PoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
+    PoolDesc.count = PoolSize;
+    ze_event_pool_handle_t NewPool;
+    CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &PoolDesc, 0, nullptr,
+                     &NewPool);
+    Pools.push_back(NewPool);
+
+    // Create events
+    ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
+    EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    EventDesc.wait = 0;
+    for (uint32_t Index = 0; Index < PoolSize; ++Index) {
+      EventDesc.index = Index;
+      ze_event_handle_t NewEvent;
+      CALL_ZE_RET_NULL(zeEventCreate, NewPool, &EventDesc, &NewEvent);
+      Events.push_back(NewEvent);
+    }
+  }
+
+  // Hand out the most recently added free event.
+  ze_event_handle_t Taken = Events.back();
+  Events.pop_back();
+  return Taken;
+}
+
+/// Reset \p Event on the host and put it back on the free list.
+/// \p Device is not used by this function.
+void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
+  std::lock_guard<std::mutex> Guard(*Mtx);
+  CALL_ZE_RET_VOID(zeEventHostReset, Event);
+  Events.push_back(Event);
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
new file mode 100644
index 0000000000000..3acb2e78927e7
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -0,0 +1,371 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget.h"
+
+#include "L0Defs.h"
+#include "L0Options.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Decide whether the device (RootID, SubID, CCSID) is selected by
+/// ONEAPI_DEVICE_SELECTOR.
+///
+/// The first entry of ExplicitRootDevices that matches all three IDs decides
+/// the result: its accept flag is returned. An entry value of -2 acts as a
+/// wildcard; for the sub-device and CCS levels the wildcard does not match
+/// an ID of -1. Returns false when no entry matches.
+bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
+                                  int32_t CCSID) const {
+  for (const auto &Entry : ExplicitRootDevices) {
+    const auto &[Accept, ERoot, ESub, ECcs] = Entry;
+
+    const bool RootMatches = (ERoot == -2) || (ERoot == RootID);
+    if (!RootMatches)
+      continue;
+
+    const bool SubMatches = (ESub == SubID) || (ESub == -2 && SubID != -1);
+    if (!SubMatches)
+      continue;
+
+    const bool CcsMatches = (ECcs == CCSID) || (ECcs == -2 && CCSID != -1);
+    if (!CcsMatches)
+      continue;
+
+    // The flag is false for discarding ('!') filters.
+    return Accept;
+  }
+  return false;
+}
+
+/// Read environment variables
+void L0OptionsTy::processEnvironmentVars() {
+  // Compilation options for IGC
+  UserCompilationOptions +=
+      std::string(" ") +
+      StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
+
+  // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
+  const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
+  if (DeviceSelectorVar.isPresent()) {
+    std::string EnvStr(std::move(DeviceSelectorVar.get()));
+    uint32_t numDiscard = 0;
+    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
+                   [](unsigned char C) { return std::tolower(C); });
+
+    std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+    for (const auto &Term : Entries) {
+      bool isDiscard = false;
+      std::vector<std::string_view> Pair = tokenize(Term, ":", true);
+      if (Pair.empty()) {
+        FAILURE_MESSAGE(
+            "Incomplete selector! Pair and device must be specified.\n");
+      } else if (Pair.size() == 1) {
+        FAILURE_MESSAGE("Incomplete selector!  Try '%s:*'if all devices "
+                        "under the Pair was original intention.\n",
+                        Pair[0].data());
+      } else if (Pair.size() > 2) {
+        FAILURE_MESSAGE(
+            "Error parsing selector string \"%s\" Too many colons (:)\n",
+            Term.data());
+      }
+      if (!((Pair[0][0] == '*') ||
+            (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
+            (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+        break;
+      isDiscard = Pair[0][0] == '!';
+      if (isDiscard)
+        numDiscard++;
+      else if (numDiscard > 0)
+        FAILURE_MESSAGE("All negative(discarding) filters must appear after "
+                        "all positive(accepting) filters!");
+
+      std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+      for (const auto &TargetStr : Targets) {
+        bool HasDeviceWildCard = false;
+        bool HasSubDeviceWildCard = false;
+        bool DeviceNum = false;
+        std::vector<std::string_view> DeviceSubTuple =
+            tokenize(TargetStr, ".", true);
+        int32_t RootD[3] = {-1, -1, -1};
+        if (DeviceSubTuple.empty()) {
+          FAILURE_MESSAGE(
+              "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
+              "specified.");
+        }
+
+        std::string_view TopDeviceStr = DeviceSubTuple[0];
+        static const std::array<std::string, 7> DeviceStr = {
+            "host", "cpu", "gpu", "acc", "fpga", "*"};
+        auto It =
+            find_if(DeviceStr.begin(), DeviceStr.end(),
+                    [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
+        if (It != DeviceStr.end()) {
+          if (TopDeviceStr[0] == '*') {
+            HasDeviceWildCard = true;
+            RootD[0] = -2;
+          } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+            continue;
+        } else {
+          std::string TDS(TopDeviceStr);
+          if (!isDigits(TDS)) {
+            FAILURE_MESSAGE("error parsing device number: %s",
+                            DeviceSubTuple[0].data());
+          } else {
+            RootD[0] = std::stoi(TDS);
+            DeviceNum = true;
+          }
+        }
+        if (DeviceSubTuple.size() >= 2) {
+          if (!DeviceNum && !HasDeviceWildCard)
+            FAILURE_MESSAGE("sub-devices can only be requested when parent "
+                            "device is specified by number or wildcard, not a "
+                            "device type like \'gpu\'");
+          std::string_view SubDeviceStr = DeviceSubTuple[1];
+          if (SubDeviceStr[0] == '*') {
+            HasSubDeviceWildCard = true;
+            RootD[1] = -2;
+          } else {
+            if (HasDeviceWildCard) // subdevice is a number and device is a *
+              FAILURE_MESSAGE(
+                  "sub-device can't be requested by number if parent "
+                  "device is specified by a wildcard.");
+
+            std::string SDS(SubDeviceStr);
+            if (!isDigits(SDS)) {
+              FAILURE_MESSAGE("error parsing subdevice index: %s",
+                              DeviceSubTuple[1].data());
+            } else
+              RootD[1] = std::stoi(SDS);
+          }
+        }
+        if (DeviceSubTuple.size() == 3) {
+          std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+          if (SubSubDeviceStr[0] == '*') {
+            RootD[2] = -2;
+          } else {
+            if (HasSubDeviceWildCard)
+              FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
+                              "sub-device before is specified by a wildcard.");
+            std::string SSDS(SubSubDeviceStr);
+            if (!isDigits(SSDS)) {
+              FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
+                              DeviceSubTuple[2].data());
+            } else
+              RootD[2] = std::stoi(SSDS);
+          }
+        } else if (DeviceSubTuple.size() > 3) {
+          FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
+                          "supported at this time ",
+                          TargetStr.data());
+        }
+        if (isDiscard)
+          ExplicitRootDevices.insert(
+              ExplicitRootDevices.begin(),
+              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+                                                          RootD[1], RootD[2]));
+        else
+          ExplicitRootDevices.push_back(
+              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+                                                          RootD[1], RootD[2]));
+      }
+    }
+  }
+
+  DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
+     ExplicitRootDevices.size());
+  DP("  (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
+     "-1(ignore)\n");
+  for (auto &T : ExplicitRootDevices) {
+    DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
+       std::get<2>(T), std::get<3>(T));
+    (void)T; // silence warning
+  }
+
+  // Memory pool
+  // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
+  //  <Option>       := 0 | <PoolInfoList>
+  //  <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]
+  //  <PoolInfo>     := <MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]
+  //  <MemType>      := all | device | host | shared
+  //  <AllocMax>     := non-negative integer or empty, max allocation size in
+  //                    MB (default: 1)
+  //  <Capacity>     := positive integer or empty, number of allocations from
+  //                    a single block (default: 4)
+  //  <PoolSize>     := positive integer or empty, max pool size in MB
+  //                    (default: 256)
+  const StringEnvar MemoryPoolVar("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL", "");
+  if (MemoryPoolVar.isPresent()) {
+    if (MemoryPoolVar.get() == "0") {
+      Flags.UseMemoryPool = 0;
+      MemPoolInfo.clear();
+    } else {
+      std::istringstream Str(MemoryPoolVar.get());
+      int32_t MemType = -1;
+      int32_t Offset = 0;
+      int32_t Valid = 1;
+      const std::array<int32_t, 3> DefaultValue{1, 4, 256};
+      const int32_t AllMemType = INT32_MAX;
+      std::array<int32_t, 3> AllInfo{1, 4, 256};
+      std::map<int32_t, std::array<int32_t, 3>> PoolInfo;
+      for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
+        if (Token == "device") {
+          MemType = TARGET_ALLOC_DEVICE;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "host") {
+          MemType = TARGET_ALLOC_HOST;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "shared") {
+          MemType = TARGET_ALLOC_SHARED;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "all") {
+          MemType = AllMemType;
+          Offset = 0;
+          Valid = 2;
+        } else if (Offset < 3 && MemType >= 0) {
+          int32_t Num = std::atoi(Token.c_str());
+          bool ValidNum = (Num >= 0 && Offset == 0) || (Num > 0 && Offset > 0);
+          if (ValidNum && MemType == AllMemType)
+            AllInfo[Offset++] = Num;
+          else if (ValidNum)
+            PoolInfo[MemType][Offset++] = Num;
+          else if (Token.size() == 0)
+            Offset++;
+          else
+            Valid = 0;
+        } else {
+          Valid = 0;
+        }
+      }
+      if (Valid > 0) {
+        if (Valid == 2) {
+          // "all" is specified -- ignore other inputs
+          if (AllInfo[0] > 0) {
+            MemPoolInfo[TARGET_ALLOC_DEVICE] = AllInfo;
+            MemPoolInfo[TARGET_ALLOC_HOST] = AllInfo;
+            MemPoolInfo[TARGET_ALLOC_SHARED] = std::move(AllInfo);
+          } else {
+            MemPoolInfo.clear();
+          }
+        } else {
+          // Use user-specified configuration
+          for (auto &I : PoolInfo) {
+            if (I.second[0] > 0)
+              MemPoolInfo[I.first] = I.second;
+            else
+              MemPoolInfo.erase(I.first);
+          }
+        }
+      } else {
+        DP("Ignoring incorrect memory pool configuration "
+           "LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=%s\n",
+           MemoryPoolVar.get().c_str());
+        DP("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>\n");
+        DP("  <Option>       := 0 | <PoolInfoList>\n");
+        DP("  <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]\n");
+        DP("  <PoolInfo>     := "
+           "<MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]\n");
+        DP("  <MemType>      := all | device | host | shared\n");
+        DP("  <AllocMax>     := non-negative integer or empty, "
+           "max allocation size in MB (default: 1)\n");
+        DP("  <Capacity>     := positive integer or empty, "
+           "number of allocations from a single block (default: 4)\n");
+        DP("  <PoolSize>     := positive integer or empty, "
+           "max pool size in MB (default: 256)\n");
+      }
+    }
+  }
+
+  if (StringEnvar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS").isPresent()) {
+    // To match SYCL RT behavior, we just need to check whether
+    // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value
+    // does not matter.
+    CommonSpecConstants.addConstant<char>(0xFF747469, 1);
+  }
+
+  // LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE=<SizeInKB>
+  const Envar<size_t> StagingBufferSizeVar(
+      "LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE");
+  if (StagingBufferSizeVar.isPresent()) {
+    size_t SizeInKB = StagingBufferSizeVar;
+    if (SizeInKB > (16 << 10)) {
+      SizeInKB = (16 << 10);
+      DP("Staging buffer size is capped at %zu KB\n", SizeInKB);
+    }
+    StagingBufferSize = SizeInKB << 10;
+  }
+
+  // LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE=<Fmt>
+  // <Fmt> := sync | async | async_ordered
+  // sync: perform synchronization after each command
+  // async: perform synchronization when it is required
+  // async_ordered: same as "async", but command is ordered
+  // This option is ignored unless IMM is fully enabled on compute and copy.
+  // On Intel PVC GPU, when used with immediate command lists over Level Zero
+  // backend, a target region may involve multiple command submissions to the
+  // L0 copy queue and compute queue. L0 events are used for each submission
+  // (data transfer of a single item or kernel execution). When "async" is
+  // specified, a) each data transfer to device is submitted with an event.
+  // b) The kernel is submitted next with a dependence on all the previous
+  // data transfer events. The kernel also has an event associated with it.
+  // c) The data transfer from device will be submitted with a dependence on
+  // the kernel event. d) Finally wait on the host for all the events
+  // associated with the data transfer from device.
+  // The env-var also affects any "target update" constructs as well.
+  // The env-var only affects the L0 copy/compute commands issued from a
+  // single target construct execution, not across multiple invocations.
+  const StringEnvar CommandModeVar("LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE");
+  if (CommandModeVar.isPresent()) {
+    if (match(CommandModeVar, "sync"))
+      CommandMode = CommandModeTy::Sync;
+    else if (match(CommandModeVar, "async"))
+      CommandMode = CommandModeTy::Async;
+    else if (match(CommandModeVar, "async_ordered"))
+      CommandMode = CommandModeTy::AsyncOrdered;
+    else
+      INVALID_OPTION(LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE,
+                     CommandModeVar.get().c_str());
+  }
+}
+/// Split \p Filter into tokens separated by \p Delim.
+///
+/// The returned views alias the characters \p Filter refers to, so that
+/// storage must outlive the result. Empty tokens are never returned; when
+/// \p ProhibitEmptyTokens is set, every empty token (before, between or after
+/// delimiters) is additionally reported through FAILURE_MESSAGE.
+std::vector<std::string_view>
+L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
+                      bool ProhibitEmptyTokens) {
+  std::vector<std::string_view> Tokens;
+  // Skip the whole delimiter after a match: advancing by a single character
+  // would leave the tail of a multi-character delimiter in the next token.
+  // An empty delimiter still advances by one so the scan always terminates.
+  const size_t Step = Delim.empty() ? 1 : Delim.size();
+  size_t Pos = 0;
+  size_t LastPos = 0;
+  while ((Pos = Filter.find(Delim, LastPos)) != std::string_view::npos) {
+    const std::string_view Tok = Filter.substr(LastPos, Pos - LastPos);
+
+    if (!Tok.empty()) {
+      Tokens.push_back(Tok);
+    } else if (ProhibitEmptyTokens) {
+      FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
+                      "before '%s' delimiter is not allowed.",
+                      Delim.c_str());
+    }
+    // Move the search starting index past the delimiter.
+    LastPos = Pos + Step;
+  }
+
+  // Add the remainder, if any.
+  if (LastPos < Filter.size()) {
+    Tokens.push_back(Filter.substr(LastPos));
+  } else if ((LastPos != 0) && ProhibitEmptyTokens) {
+    // The delimiter is the last symbol in the string.
+    FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
+                    "'%s' delimiter is not allowed.",
+                    Delim.c_str());
+  }
+  return Tokens;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
new file mode 100644
index 0000000000000..51d6595560484
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -0,0 +1,285 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/zes_api.h>
+
+#include "L0Device.h"
+#include "L0Interop.h"
+#include "L0Kernel.h"
+#include "L0Plugin.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+using namespace llvm::omp::target;
+using namespace error;
+
+// The static options object below needs a global constructor; silence the
+// corresponding clang warning. NOTE(review): there is no matching
+// push/pop, so the warning stays disabled for the rest of this file.
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+// Common data across all possible plugin instantiations
+L0OptionsTy LevelZeroPluginTy::Options;
+
+/// Discover Level Zero drivers and their devices, then create one L0DeviceTy
+/// per selected root device.
+///
+/// Populates ContextList (one entry per driver that reports at least one
+/// device) and L0Devices, and sets NumDevices. Discrete devices are ordered
+/// before non-discrete ones; within each group the enumeration order is kept.
+/// When the options hold explicit root devices, only devices accepted by
+/// shouldAddDevice() are added.
+///
+/// \returns getNumRootDevices(), or 0 when no driver is found.
+/// NOTE(review): CALL_ZE_RET_ZERO presumably returns 0 from this function
+/// when the wrapped call fails -- confirm against the macro definition.
+int32_t LevelZeroPluginTy::findDevices() {
+  CALL_ZE_RET_ZERO(zeInit, ZE_INIT_FLAG_GPU_ONLY);
+  // First call only queries the number of drivers.
+  uint32_t NumDrivers = 0;
+  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, nullptr);
+  if (NumDrivers == 0) {
+    DP("Cannot find any drivers.\n");
+    return 0;
+  }
+  // Explicit mode: the user restricted the device set through the options.
+  const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
+
+  // We expect multiple drivers on Windows to support different device types,
+  // so we need to maintain multiple drivers and contexts in general.
+  llvm::SmallVector<ze_driver_handle_t> FoundDrivers(NumDrivers);
+  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, FoundDrivers.data());
+
+  // One record per root device found, across all drivers.
+  struct RootInfoTy {
+    uint32_t OrderId; // position in the overall enumeration order
+    ze_device_handle_t zeDevice;
+    L0ContextTy *Driver; // context entry of the owning driver
+    bool IsDiscrete;
+  };
+  llvm::SmallVector<RootInfoTy> RootDevices;
+
+  uint32_t OrderId = 0;
+  for (uint32_t DriverId = 0; DriverId < NumDrivers; DriverId++) {
+    const auto &Driver = FoundDrivers[DriverId];
+    uint32_t DeviceCount = 0;
+    ze_result_t RC;
+    CALL_ZE(RC, zeDeviceGet, Driver, &DeviceCount, nullptr);
+    if (RC != ZE_RESULT_SUCCESS || DeviceCount == 0) {
+      DP("Cannot find any devices from driver " DPxMOD ".\n", DPxPTR(Driver));
+      continue;
+    }
+    // We have a driver that supports at least one device
+    // NOTE(review): the address of the new ContextList element is stored in
+    // RootDevices below and used after later emplace_back calls; this assumes
+    // ContextList keeps element addresses stable on growth -- confirm its
+    // container type.
+    ContextList.emplace_back(*this, Driver, DriverId);
+    auto &DrvInfo = ContextList.back();
+    llvm::SmallVector<ze_device_handle_t> FoundDevices(DeviceCount);
+    CALL_ZE_RET_ZERO(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
+
+    for (auto &zeDevice : FoundDevices)
+      RootDevices.push_back(
+          {OrderId++, zeDevice, &DrvInfo, L0DeviceTy::isDiscrete(zeDevice)});
+  }
+
+  // move discrete devices to the front
+  std::sort(RootDevices.begin(), RootDevices.end(),
+            [](const RootInfoTy &A, const RootInfoTy &B) {
+              // if both are discrete, order by OrderId
+              // if both are not discrete, order by OrderId
+              // Otherwise, discrete goes first
+
+              if (A.IsDiscrete && B.IsDiscrete)
+                return A.OrderId < B.OrderId;
+              if (!A.IsDiscrete && !B.IsDiscrete)
+                return A.OrderId < B.OrderId;
+              return A.IsDiscrete;
+            });
+
+  // A device selected for creation together with its owning context.
+  struct DeviceInfoTy {
+    L0DeviceIdTy Id;
+    L0ContextTy *Driver;
+    bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
+  };
+
+  llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
+
+  // helper lambdas
+  // Queue a device unless explicit mode is active and the options reject it.
+  auto addDevice = [ExplicitMode,
+                    &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+                                   int32_t SubId = -1, int32_t CCSId = -1) {
+    if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
+      DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
+    }
+  };
+  // Only root devices are queued here; the root id is the index in the
+  // sorted list (converted from size_t to the lambda's int32_t parameter).
+  for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
+    const auto zeDevice = RootDevices[RootId].zeDevice;
+    auto *RootDriver = RootDevices[RootId].Driver;
+    addDevice(zeDevice, RootDriver, RootId);
+  }
+  NumDevices = DevicesToAdd.size();
+  auto DeviceId = 0;
+  for (auto &DeviceInfo : DevicesToAdd) {
+    auto RootId = DeviceInfo.Id.RootId;
+    auto SubId = DeviceInfo.Id.SubId;
+    auto CCSId = DeviceInfo.Id.CCSId;
+    auto zeDevice = DeviceInfo.Id.zeId;
+    auto *Driver = DeviceInfo.Driver;
+
+    // Human-readable id "Root[.Sub[.CCS]]"; negative components are omitted.
+    std::string IdStr = std::to_string(RootId) +
+                        (SubId < 0 ? "" : "." + std::to_string(SubId)) +
+                        (CCSId < 0 ? "" : "." + std::to_string(CCSId));
+
+    L0Devices.push_back(new L0DeviceTy(*this, DeviceId, getNumRootDevices(),
+                                       zeDevice, *Driver, std::move(IdStr),
+                                       CCSId < 0 ? 0 : CCSId /* ComputeIndex */
+                                       ));
+    DeviceId++;
+  }
+
+  DP("Found %" PRIu32 " root devices, %" PRIu32 " total devices.\n",
+     getNumRootDevices(), NumDevices);
+  DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
+  for (auto &l0Device : L0Devices) {
+    DP("-- %s\n", l0Device->getZeIdCStr());
+    (void)l0Device; // silence warning
+  }
+
+  // Detailed per-device report only when debugging is enabled.
+  if (getDebugLevel() > 0) {
+    DP("Root Device Information\n");
+    for (uint32_t I = 0; I < getNumRootDevices(); I++) {
+      auto &l0Device = getDeviceFromId(I);
+      l0Device.reportDeviceInfo();
+    }
+  }
+
+  return getNumRootDevices();
+}
+
+/// Clean-up routine to be invoked by the destructor or
+/// LevelZeroPluginTy::deinit.
+///
+/// Clears the three TLS tables and then the list of driver contexts.
+void LevelZeroPluginTy::closeRTL() {
+
+  ContextTLSTable.clear();
+  DeviceTLSTable.clear();
+  ThreadTLSTable.clear();
+  // NOTE(review): contexts are cleared last, presumably because the TLS
+  // entries refer to them -- confirm before reordering.
+  ContextList.clear();
+
+  DP("Plugin closed successfully\n");
+}
+
+/// Initialize the plugin: process the options, then discover the devices.
+/// \returns the value produced by findDevices().
+Expected<int32_t> LevelZeroPluginTy::initImpl() {
+  DP("Level0 NG plugin initialization\n");
+  // process options before anything else
+  Options.init();
+  return findDevices();
+}
+
+/// Tear the plugin down by running closeRTL(); always reports success.
+Error LevelZeroPluginTy::deinitImpl() {
+  DP("Deinit Level0 plugin!\n");
+  closeRTL();
+  return Plugin::success();
+}
+
+/// Return the device registered under \p DeviceId.
+///
+/// No new object is created here: the result is the address of the device
+/// returned by getDeviceFromId(). \p Plugin and \p NumDevices are unused.
+GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
+                                                 int32_t DeviceId,
+                                                 int32_t NumDevices) {
+  return &getDeviceFromId(DeviceId);
+}
+
+/// Allocate a new L0GlobalHandlerTy with `new` and return it; releasing it
+/// is left to the caller.
+GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
+  return new L0GlobalHandlerTy();
+}
+
+/// Return the ELF machine value this plugin reports. 0x8086 is a placeholder
+/// until a real machine type is registered (see TODO).
+uint16_t LevelZeroPluginTy::getMagicElfBits() const {
+  // TODO: We need to register a real ELF machine type
+  return 0x8086;
+}
+
+/// Return the target architecture handled by this plugin: 64-bit SPIR-V.
+Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
+  return Triple::spirv64;
+}
+
+/// Return the plugin name, i.e. GETNAME applied to the TARGET_NAME macro.
+const char *LevelZeroPluginTy::getName() const { return GETNAME(TARGET_NAME); }
+
+/// Flushing an interop queue is a no-op in this plugin; \p Interop is unused
+/// and success is always returned.
+Error LevelZeroPluginTy::flushQueueImpl(omp_interop_val_t *Interop) {
+  return Plugin::success();
+}
+
+/// Report whether \p Image passes isValidOneOmpImage().
+///
+/// \p DeviceId is not consulted, and the version numbers that
+/// isValidOneOmpImage() may write are discarded.
+Expected<bool> LevelZeroPluginTy::isELFCompatible(uint32_t DeviceId,
+                                                  StringRef Image) const {
+  uint64_t MajorVer, MinorVer;
+  return isValidOneOmpImage(Image, MajorVer, MinorVer);
+}
+
+/// Block the host until the work queued on \p Interop has completed.
+///
+/// Returns an INVALID_ARGUMENT error for a null \p Interop, and succeeds
+/// immediately when the interop object has no async info or no queue.
+/// Otherwise it either appends a barrier to the immediate command list and
+/// waits for its event, or synchronizes the command queue, depending on
+/// useImmForInterop().
+Error LevelZeroPluginTy::syncBarrierImpl(omp_interop_val_t *Interop) {
+  if (!Interop) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  if (!Interop->async_info || !Interop->async_info->Queue)
+    return Plugin::success();
+
+  // L0 object
+  // NOTE(review): rtl_property is dereferenced below without a null check.
+  const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  const auto device_id = Interop->device_id;
+  auto &l0Device = getDeviceFromId(device_id);
+
+  // We can synchronize both L0 & SYCL objects with the same ze command
+  if (l0Device.useImmForInterop()) {
+    DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+       " with ImmCmdList barrier\n",
+       DPxPTR(Interop));
+    auto ImmCmdList = L0->ImmCmdList;
+    auto Event = l0Device.getEvent();
+
+    // NOTE(review): if CALL_ZE_RET_ERROR returns from this function on
+    // failure (as its name suggests), Event is not handed back through
+    // releaseEvent() on these error paths -- confirm and release if so.
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, Event, 0,
+                      nullptr);
+    // Wait without a timeout for the barrier's event.
+    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
+    l0Device.releaseEvent(Event);
+  } else {
+    DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+       " with queue synchronize\n",
+       DPxPTR(Interop));
+    auto CmdQueue = L0->CommandQueue;
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+  }
+
+  return Plugin::success();
+}
+
+/// Enqueue a barrier on the queue of \p Interop without waiting on the host.
+///
+/// Returns an INVALID_ARGUMENT error for a null \p Interop. Succeeds without
+/// doing anything when the interop object has no async info or queue, or when
+/// it is marked in-order. Otherwise the barrier is appended to the immediate
+/// command list, or to a regular command list that is closed, submitted to
+/// the command queue and reset, depending on useImmForInterop().
+Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
+  if (!Interop) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  if (!Interop->async_info || !Interop->async_info->Queue)
+    return Plugin::success();
+
+  // NOTE(review): rtl_property is dereferenced below without a null check.
+  const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  const auto device_id = Interop->device_id;
+  // In-order interop objects need no explicit barrier.
+  if (Interop->attrs.inorder)
+    return Plugin::success();
+
+  auto &l0Device = getDeviceFromId(device_id);
+  if (l0Device.useImmForInterop()) {
+    DP("LevelZeroPluginTy::async_barrier: Appending ImmCmdList barrier "
+       "to " DPxMOD "\n",
+       DPxPTR(Interop));
+    auto ImmCmdList = L0->ImmCmdList;
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, nullptr, 0,
+                      nullptr);
+  } else {
+    DP("LevelZeroPluginTy::async_barrier: Appending CmdList barrier to " DPxMOD
+       "\n",
+       DPxPTR(Interop));
+    auto CmdQueue = L0->CommandQueue;
+    ze_command_list_handle_t CmdList = l0Device.getCmdList();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                      nullptr);
+    // NOTE(review): the list is reset right after submission with no
+    // synchronization in between -- confirm the device can no longer be
+    // referencing it at this point. An early return from any macro above
+    // also leaves the list un-reset.
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+  }
+
+  return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
+
+extern "C" {
+/// C-linkage factory: allocate a LevelZeroPluginTy with `new` and return it
+/// as a GenericPluginTy pointer.
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_level_zero() {
+  return new llvm::omp::target::plugin::LevelZeroPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
new file mode 100644
index 0000000000000..33c19b0e7c50d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -0,0 +1,625 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <dlfcn.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // !_WIN32
+
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Look up the device address of \p DeviceGlobal in the program built from
+/// \p Image and store it in \p DeviceGlobal.
+///
+/// The pointer is stored even when the lookup yields nullptr; in that case an
+/// UNKNOWN error naming the global is returned.
+Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                                     DeviceImageTy &Image,
+                                                     GlobalTy &DeviceGlobal) {
+  auto &L0Dev = static_cast<L0DeviceTy &>(Device);
+  const char *Name = DeviceGlobal.getName().data();
+
+  void *DeviceAddr = L0Dev.getProgramFromImage(Image.getTgtImage())
+                         ->getOffloadVarDeviceAddr(Name);
+
+  // Record the lookup result unconditionally, nullptr included.
+  DeviceGlobal.setPtr(DeviceAddr);
+
+  if (DeviceAddr != nullptr)
+    return Plugin::success();
+
+  return Plugin::error(ErrorCode::UNKNOWN, "Failed to load global '%s'", Name);
+}
+
+/// Return the device of this program as an L0DeviceTy, obtained by passing
+/// getDevice() through L0DeviceTy::makeL0Device().
+inline L0DeviceTy &L0ProgramTy::getL0Device() const {
+  return L0DeviceTy::makeL0Device(getDevice());
+}
+
+/// Destroy the kernels owned by this program and then its Level Zero modules.
+L0ProgramTy::~L0ProgramTy() {
+  for (auto *Kernel : Kernels) {
+    // We need explicit destructor and deallocate calls to release the kernels
+    // created by `GenericDeviceTy::constructKernel()`.
+    Kernel->~L0KernelTy();
+    getL0Device().getPlugin().free(Kernel);
+  }
+  // NOTE(review): if CALL_ZE_RET_VOID returns on failure (as its name
+  // suggests), a failing zeModuleDestroy skips the remaining modules.
+  for (auto Module : Modules) {
+    CALL_ZE_RET_VOID(zeModuleDestroy, Module);
+  }
+}
+
+/// Decide whether the current image has to be built as a library module and
+/// set IsLibModule accordingly.
+///
+/// An image qualifies when none of its offload entries is a kernel (a named
+/// entry of size 0) and the file that contains the image, as reported by
+/// dladdr(), is a 64-bit little-endian ELF of type ET_DYN. On Windows this is
+/// a no-op.
+void L0ProgramTy::setLibModule() {
+#ifdef _WIN32
+  return;
+#else
+  const auto *Image = getTgtImage();
+  const size_t NumEntries =
+      static_cast<size_t>(Image->EntriesEnd - Image->EntriesBegin);
+  for (size_t I = 0; I < NumEntries; I++) {
+    const auto &Entry = Image->EntriesBegin[I];
+    // Image contains a kernel, so it is not compiled as a library module
+    if (Entry.SymbolName && Entry.Size == 0)
+      return;
+  }
+
+  // Check if the image belongs to a dynamic library
+  Dl_info DLI{nullptr};
+  if (!dladdr(Image->ImageStart, &DLI) || !DLI.dli_fname)
+    return;
+
+  std::vector<uint8_t> FileBin;
+  const auto Size = readFile(DLI.dli_fname, FileBin);
+  if (!Size)
+    return;
+
+  auto MB = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(FileBin.data()), Size),
+      /*BufferName=*/"", /*RequiresNullTerminator=*/false);
+  auto ELF = ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+  if (!ELF) {
+    // The containing file is not a parsable ELF object. A failed Expected
+    // must have its error taken before it is destroyed.
+    consumeError(ELF.takeError());
+    return;
+  }
+
+  const auto *Obj = dyn_cast<ELF64LEObjectFile>((*ELF).get());
+  if (!Obj)
+    return;
+
+  const auto Header = Obj->getELFFile().getHeader();
+  if (Header.e_type == ELF::ET_DYN) {
+    DP("Processing current image as library\n");
+    IsLibModule = true;
+  }
+#endif // _WIN32
+}
+
+/// Build a Level Zero module from a single image and register it with this
+/// program and with its device.
+///
+/// \param Size               size of \p Image in bytes.
+/// \param Image              module binary in the format given by \p Format.
+/// \param CommonBuildOptions build flags; " -library-compilation " is
+///                           appended for library modules.
+/// \param Format             Level Zero module format of \p Image.
+/// \returns OFFLOAD_SUCCESS when the module was built, and also when building
+///          a library module failed; OFFLOAD_FAIL otherwise.
+int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
+                               const std::string &CommonBuildOptions,
+                               ze_module_format_t Format) {
+  const ze_module_constants_t SpecConstants =
+      LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
+  auto &l0Device = getL0Device();
+  std::string BuildOptions(CommonBuildOptions);
+
+  // Add required flag to enable dynamic linking.
+  if (IsLibModule)
+    BuildOptions += " -library-compilation ";
+
+  // Build a single module from a single image
+  ze_module_desc_t ModuleDesc{};
+  ModuleDesc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
+  ModuleDesc.pNext = nullptr;
+  ModuleDesc.format = Format;
+  ModuleDesc.inputSize = Size;
+  ModuleDesc.pInputModule = Image;
+  ModuleDesc.pBuildFlags = BuildOptions.c_str();
+  ModuleDesc.pConstants = &SpecConstants;
+
+  ze_module_handle_t Module = nullptr;
+  ze_result_t RC;
+  // The build log is never inspected here, so none is requested: a returned
+  // log handle would have to be released with zeModuleBuildLogDestroy.
+  CALL_ZE_RC(RC, zeModuleCreate, l0Device.getZeContext(),
+             l0Device.getZeDevice(), &ModuleDesc, &Module, nullptr);
+
+  if (RC != ZE_RESULT_SUCCESS) {
+    // A library module that fails to build is tolerated.
+    return IsLibModule ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
+  }
+
+  // Check if module link is required. We do not need this check for
+  // library module
+  if (!RequiresModuleLink && !IsLibModule) {
+    ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+                                         nullptr, 0};
+    CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+    RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+  }
+  // For now, assume the first module contains libraries, globals.
+  if (Modules.empty())
+    GlobalModule = Module;
+  Modules.push_back(Module);
+  l0Device.addGlobalModule(Module);
+  return OFFLOAD_SUCCESS;
+}
+
+/// Dynamically link the device's global modules when a previously added
+/// module reported imports.
+///
+/// \returns OFFLOAD_SUCCESS when no link is required or the link succeeded;
+///          OFFLOAD_FAIL when this program has no modules or the link failed.
+int32_t L0ProgramTy::linkModules() {
+  auto &l0Device = getL0Device();
+  if (!RequiresModuleLink) {
+    DP("Module link is not required\n");
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (Modules.empty()) {
+    DP("Invalid number of modules when linking modules\n");
+    return OFFLOAD_FAIL;
+  }
+
+  ze_result_t RC;
+  // The link log is never inspected here, so none is requested: a returned
+  // log handle would have to be released with zeModuleBuildLogDestroy.
+  CALL_ZE_RC(RC, zeModuleDynamicLink,
+             static_cast<uint32_t>(l0Device.getNumGlobalModules()),
+             l0Device.getGlobalModulesArray(), nullptr);
+  return (RC != ZE_RESULT_SUCCESS) ? OFFLOAD_FAIL : OFFLOAD_SUCCESS;
+}
+
+/// Read the whole file \p FileName into \p OutFile.
+///
+/// \returns the number of bytes read, or 0 when the file cannot be opened,
+///          its size cannot be determined, it is empty, or reading fails.
+///          \p OutFile is cleared when the read itself fails.
+size_t L0ProgramTy::readFile(const char *FileName,
+                             std::vector<uint8_t> &OutFile) const {
+  std::ifstream IFS(FileName, std::ios::binary);
+  if (!IFS.good())
+    return 0;
+  IFS.seekg(0, IFS.end);
+  // tellg() reports failure as -1; converting that to size_t would request
+  // an enormous resize below.
+  const std::streamoff EndOffset = IFS.tellg();
+  if (EndOffset < 0)
+    return 0;
+  const auto FileSize = static_cast<size_t>(EndOffset);
+  OutFile.resize(FileSize);
+  IFS.seekg(0);
+  if (!IFS.read(reinterpret_cast<char *>(OutFile.data()), FileSize)) {
+    OutFile.clear();
+    return 0;
+  }
+  return FileSize;
+}
+
+/// Read the SPIR-V file \p FileName into \p OutSPV.
+///
+/// \p FileName is resolved against the directory of the binary that contains
+/// __tgt_target_data_begin_nowait.
+///
+/// \returns OFFLOAD_SUCCESS when the file was read, OFFLOAD_FAIL when the
+///          directory cannot be determined or readFile() returns 0.
+int32_t L0ProgramTy::readSPVFile(const char *FileName,
+                                 std::vector<uint8_t> &OutSPV) const {
+  // Resolve full path using the location of the plugin
+  std::string FullPath;
+#ifdef _WIN32
+  char RTLPath[_MAX_PATH];
+  HMODULE RTLModule = nullptr;
+  if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                              GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                          (LPCSTR)&__tgt_target_data_begin_nowait,
+                          &RTLModule)) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  FullPath = RTLPath;
+#else  // _WIN32
+  Dl_info RTLInfo;
+  // Also reject a missing file name: constructing a std::string from a null
+  // pointer is undefined behavior.
+  if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo) ||
+      !RTLInfo.dli_fname) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  FullPath = RTLInfo.dli_fname;
+#endif // _WIN32
+  // Keep the directory part (including the separator) and replace the file
+  // name; without any separator the whole path is replaced.
+  const size_t PathSep = FullPath.find_last_of("/\\");
+  const size_t DirLen = (PathSep == std::string::npos) ? 0 : PathSep + 1;
+  FullPath.replace(DirLen, std::string::npos, FileName);
+  // Read from the full path
+  if (!readFile(FullPath.c_str(), OutSPV)) {
+    DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Rewrite driver-level options found in \p Options into the spelling the
+/// backend expects.
+///
+/// For each table entry only the first occurrence in \p Options is replaced.
+/// \p Device is not used.
+void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+                                                   std::string &Options) const {
+  // Driver option paired with the backend-specific option replacing it.
+  struct TranslationTy {
+    std::string Option;
+    std::string BackendOption;
+  };
+  static const TranslationTy Translations[] = {
+      {"-ftarget-compile-fast",
+       "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"},
+      {"-foffload-fp32-prec-div", "-ze-fp32-correctly-rounded-divide-sqrt"},
+      {"-foffload-fp32-prec-sqrt", "-ze-fp32-correctly-rounded-divide-sqrt"},
+  };
+
+  for (const TranslationTy &Entry : Translations) {
+    const size_t Found = Options.find(Entry.Option);
+    if (Found == std::string::npos)
+      continue;
+    Options.replace(Found, Entry.Option.length(), Entry.BackendOption);
+  }
+}
+
+// Note types compared against Note.getType() for ELF notes named
+// "INTELONEOMPOFFLOAD".
+// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h:
+#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3
+
+/// Check whether \p Image is an ELF image carrying "INTELONEOMPOFFLOAD"
+/// notes of a supported version.
+///
+/// \param Image    raw bytes of the candidate image.
+/// \param MajorVer set to the major version when a well-formed version note
+///                 is found; left untouched otherwise.
+/// \param MinorVer set to the minor version under the same condition.
+/// \returns true when the first version note found says "1.0", or when no
+///          version note exists but at least one INTELONEOMPOFFLOAD note
+///          does; false for anything else, including images that cannot be
+///          parsed as ELF and malformed version strings.
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer) {
+  const auto MB = MemoryBuffer::getMemBuffer(Image,
+                                             /*BufferName=*/"",
+                                             /*RequiresNullTerminator=*/false);
+  auto ExpectedNewE =
+      ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+  if (!ExpectedNewE) {
+    // A failed Expected must have its error taken before it is destroyed.
+    consumeError(ExpectedNewE.takeError());
+    DP("Warning: unable to get ELF handle!\n");
+    return false;
+  }
+  bool Res = false;
+  auto processObjF = [&](const auto ELFObjF) {
+    if (!ELFObjF) {
+      DP("Warning: Unexpected ELF type!\n");
+      return false;
+    }
+    const auto &ELFF = ELFObjF->getELFFile();
+    auto Sections = ELFF.sections();
+    if (!Sections) {
+      consumeError(Sections.takeError());
+      DP("Warning: unable to get ELF sections!\n");
+      return false;
+    }
+    bool SeenOffloadSection = false;
+    for (auto Sec : *Sections) {
+      if (Sec.sh_type != ELF::SHT_NOTE)
+        continue;
+      Error Err = Plugin::success();
+      for (auto Note : ELFF.notes(Sec, Err)) {
+        // Inspecting Err here also marks it as checked before any early
+        // return below.
+        if (Err) {
+          consumeError(std::move(Err));
+          DP("Warning: unable to get ELF notes handle!\n");
+          return false;
+        }
+        if (Note.getName() != "INTELONEOMPOFFLOAD")
+          continue;
+        SeenOffloadSection = true;
+        if (Note.getType() != NT_INTEL_ONEOMP_OFFLOAD_VERSION)
+          continue;
+
+        const std::string DescStr = Note.getDescAsStringRef(4).str();
+        const auto DelimPos = DescStr.find('.');
+        if (DelimPos == std::string::npos) {
+          // The version has to look like "Major#.Minor#".
+          DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n",
+             DescStr.c_str());
+          return false;
+        }
+        // Parse the leading digits of each component without exceptions;
+        // std::stoull would throw on a malformed note.
+        StringRef MajorStr = StringRef(DescStr).substr(0, DelimPos);
+        StringRef MinorStr = StringRef(DescStr).substr(DelimPos + 1);
+        uint64_t Major = 0;
+        uint64_t Minor = 0;
+        if (MajorStr.consumeInteger(10, Major) ||
+            MinorStr.consumeInteger(10, Minor)) {
+          DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n",
+             DescStr.c_str());
+          return false;
+        }
+        MajorVer = Major;
+        MinorVer = Minor;
+        return (MajorVer == 1 && MinorVer == 0);
+      }
+      // The note iterator reports failures through Err once iteration stops,
+      // so it has to be inspected (and consumed) after the loop as well.
+      if (Err) {
+        consumeError(std::move(Err));
+        DP("Warning: unable to get ELF notes handle!\n");
+        return false;
+      }
+    }
+    return SeenOffloadSection;
+  };
+  if (const auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+    Res = processObjF(O);
+  } else if (const auto *O =
+                 dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+    Res = processObjF(O);
+  } else {
+    assert(false && "Unexpected ELF format");
+  }
+  return Res;
+}
+
+/// Return a StringRef over the bytes from Image->ImageStart up to (but not
+/// including) Image->ImageEnd. The data is not copied.
+static StringRef getImageStringRef(const __tgt_device_image *Image) {
+  const auto *First = reinterpret_cast<const char *>(Image->ImageStart);
+  const auto *Last = reinterpret_cast<const char *>(Image->ImageEnd);
+  return StringRef(First, static_cast<size_t>(Last - First));
+}
+
+/// Overload for a device image descriptor: forwards the bytes between
+/// ImageStart and ImageEnd to the StringRef overload.
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer) {
+  return isValidOneOmpImage(getImageStringRef(Image), MajorVer, MinorVer);
+}
+
+int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
+  auto &l0Device = getL0Device();
+  auto *Image = getTgtImage();
+  if (identify_magic(getImageStringRef(Image)) == file_magic::spirv_object) {
+    // Handle legacy plain SPIR-V image.
+    uint8_t *ImgBegin = reinterpret_cast<uint8_t *>(Image->ImageStart);
+    uint8_t *ImgEnd = reinterpret_cast<uint8_t *>(Image->ImageEnd);
+    size_t ImgSize = ImgEnd - ImgBegin;
+    return addModule(ImgSize, ImgBegin, BuildOptions,
+                     ZE_MODULE_FORMAT_IL_SPIRV);
+  }
+
+  uint64_t MajorVer, MinorVer;
+  if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+    DP("Warning: image is not a valid oneAPI OpenMP image.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  setLibModule();
+
+  // Iterate over the images and pick the first one that fits.
+  uint64_t ImageCount = 0;
+  struct V1ImageInfo {
+    // 0 - native, 1 - SPIR-V
+    uint64_t Format = (std::numeric_limits<uint64_t>::max)();
+    std::string CompileOpts;
+    std::string LinkOpts;
+    // We may have multiple sections created from split-kernel mode
+    std::vector<const uint8_t *> PartBegin;
+    std::vector<uint64_t> PartSize;
+
+    V1ImageInfo(uint64_t Format, std::string CompileOpts, std::string LinkOpts)
+        : Format(Format), CompileOpts(std::move(CompileOpts)),
+          LinkOpts(std::move(LinkOpts)) {}
+  };
+  std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
+
+  auto MB = MemoryBuffer::getMemBuffer(getImageStringRef(Image),
+                                       /*BufferName=*/"",
+                                       /*RequiresNullTerminator=*/false);
+  auto ExpectedNewE =
+      ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+  assert(ExpectedNewE &&
+         "isValidOneOmpImage() returns true for invalid ELF image");
+  auto processELF = [&](auto *EObj) {
+    assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
+    assert(MajorVer == 1 && MinorVer == 0 &&
+           "FIXME: update image processing for new oneAPI OpenMP version.");
+    const auto &E = EObj->getELFFile();
+    // Collect auxiliary information.
+    uint64_t MaxImageIdx = 0;
+
+    auto Sections = E.sections();
+    assert(Sections && "isValidOneOmpImage() returns true for ELF image with "
+                       "invalid sections.");
+
+    for (auto Sec : *Sections) {
+      if (Sec.sh_type != ELF::SHT_NOTE)
+        continue;
+      Error Err = Plugin::success();
+      for (auto Note : E.notes(Sec, Err)) {
+        assert(!Err && "isValidOneOmpImage() returns true for ELF image with "
+                       "invalid notes.");
+        if (Note.getName().str() != "INTELONEOMPOFFLOAD")
+          continue;
+
+        const uint64_t Type = Note.getType();
+        std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+        switch (Type) {
+        default:
+          DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
+          ImageCount = std::stoull(DescStr);
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
+          std::vector<std::string> Parts;
+          do {
+            const auto DelimPos = DescStr.find('\0');
+            if (DelimPos == std::string::npos) {
+              Parts.push_back(std::move(DescStr));
+              break;
+            }
+            Parts.push_back(DescStr.substr(0, DelimPos));
+            DescStr.erase(0, DelimPos + 1);
+          } while (Parts.size() < 4);
+
+          // Ignore records with less than 4 strings.
+          if (Parts.size() != 4) {
+            DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX "
+               "record is ignored.\n");
+            continue;
+          }
+
+          const uint64_t Idx = std::stoull(Parts[0]);
+          MaxImageIdx = (std::max)(MaxImageIdx, Idx);
+          if (AuxInfo.find(Idx) != AuxInfo.end()) {
+            DP("Warning: duplicate auxiliary information for image %" PRIu64
+               " is ignored.\n",
+               Idx);
+            continue;
+          }
+          AuxInfo.emplace(
+              std::piecewise_construct, std::forward_as_tuple(Idx),
+              std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+          // Image pointer and size
+          // will be initialized later.
+        }
+        }
+      }
+    }
+
+    if (MaxImageIdx >= ImageCount)
+      DP("Warning: invalid image index found in auxiliary information.\n");
+
+    for (auto Sec : *Sections) {
+      const char *Prefix = "__openmp_offload_spirv_";
+      auto ExpectedSectionName = E.getSectionName(Sec);
+      assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
+                                    "image with invalid section names");
+      std::string SectionName = (*ExpectedSectionName).str();
+      if (SectionName.find(Prefix) != 0)
+        continue;
+      SectionName.erase(0, std::strlen(Prefix));
+
+      // Expected section name in split-kernel mode:
+      // __openmp_offload_spirv_<image_id>_<part_id>
+      auto PartIdLoc = SectionName.find("_");
+      if (PartIdLoc != std::string::npos) {
+        DP("Found a split section in the image\n");
+        // It seems that we do not need part ID as long as they are ordered
+        // in the image and we keep the ordering in the runtime.
+        SectionName.erase(PartIdLoc);
+      } else {
+        DP("Found a single section in the image\n");
+      }
+
+      uint64_t Idx = std::stoull(SectionName);
+      if (Idx >= ImageCount) {
+        DP("Warning: ignoring image section (index %" PRIu64
+           " is out of range).\n",
+           Idx);
+        continue;
+      }
+
+      auto AuxInfoIt = AuxInfo.find(Idx);
+      if (AuxInfoIt == AuxInfo.end()) {
+        DP("Warning: ignoring image section (no aux info).\n");
+        continue;
+      }
+      auto Contents = E.getSectionContents(Sec);
+      assert(Contents);
+      AuxInfoIt->second.PartBegin.push_back((*Contents).data());
+      AuxInfoIt->second.PartSize.push_back(Sec.sh_size);
+    }
+  };
+
+  if (auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+    processELF(O);
+  } else if (auto *O = dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+    processELF(O);
+  } else {
+    assert(false && "Unexpected ELF format");
+  }
+
+  for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) {
+    const auto It = AuxInfo.find(Idx);
+    if (It == AuxInfo.end()) {
+      DP("Warning: image %" PRIu64
+         " without auxiliary information is ingored.\n",
+         Idx);
+      continue;
+    }
+
+    const auto NumParts = It->second.PartBegin.size();
+    // Split-kernel is not supported in SPIRV format
+    if (NumParts > 1 && It->second.Format != 0) {
+      DP("Warning: split-kernel images are not supported in SPIRV format\n");
+      continue;
+    }
+
+    // Skip unknown image format
+    if (It->second.Format != 0 && It->second.Format != 1) {
+      DP("Warning: image %" PRIu64 "is ignored due to unknown format.\n", Idx);
+      continue;
+    }
+
+    const bool IsBinary = (It->second.Format == 0);
+    const auto ModuleFormat =
+        IsBinary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV;
+    std::string Options = BuildOptions;
+    {
+      Options += " " + It->second.CompileOpts + " " + It->second.LinkOpts;
+      replaceDriverOptsWithBackendOpts(l0Device, Options);
+    }
+
+    for (size_t I = 0; I < NumParts; I++) {
+      const unsigned char *ImgBegin =
+          reinterpret_cast<const unsigned char *>(It->second.PartBegin[I]);
+      size_t ImgSize = It->second.PartSize[I];
+
+      auto RC = addModule(ImgSize, ImgBegin, Options, ModuleFormat);
+
+      if (RC != OFFLOAD_SUCCESS) {
+        DP("Error: failed to create program from %s "
+           "(%" PRIu64 "-%zu).\n",
+           IsBinary ? "Binary" : "SPIR-V", Idx, I);
+        return OFFLOAD_FAIL;
+      }
+    }
+
+    DP("Created module from image #%" PRIu64 ".\n", Idx);
+    BuildOptions = std::move(Options);
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  return OFFLOAD_FAIL;
+}
+
+/// Return the device address of the global \p CName, or nullptr when the
+/// program has no global module, \p CName is null, or no module exports it.
+void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
+  // Validate before logging: handing a null pointer to "%s" is undefined.
+  if (!GlobalModule || !CName)
+    return nullptr;
+  DP("Looking up OpenMP global variable '%s'.\n", CName);
+  size_t SizeDummy = 0;
+  void *DevicePtr = nullptr;
+  ze_result_t RC;
+  // Query every module in order; the first successful lookup wins.
+  for (auto Module : Modules) {
+    CALL_ZE(RC, zeModuleGetGlobalPointer, Module, CName, &SizeDummy,
+            &DevicePtr);
+    if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+      return DevicePtr;
+  }
+  DP("Warning: global variable '%s' was not found in the device.\n", CName);
+  return nullptr;
+}
+
+// Reads Size bytes of the GlobalModule global called Name into HostPtr.
+int32_t L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
+                                        void *HostPtr) {
+  ze_result_t RC;
+  void *DevicePtr = nullptr;
+  size_t SizeDummy = 0; // Out-parameter of the query; unused afterwards.
+  CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+          &DevicePtr);
+  if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+    return getL0Device().enqueueMemCopy(HostPtr, DevicePtr, Size);
+  DP("Warning: cannot read from device global variable %s\n", Name);
+  return OFFLOAD_FAIL;
+}
+
+// Writes Size bytes from HostPtr to the GlobalModule global called Name.
+int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
+                                         const void *HostPtr) {
+  ze_result_t RC;
+  void *DevicePtr = nullptr;
+  size_t SizeDummy = 0; // Out-parameter of the query; unused afterwards.
+  CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+          &DevicePtr);
+  if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+    return getL0Device().enqueueMemCopy(DevicePtr, HostPtr, Size);
+  DP("Warning: cannot write to device global variable %s\n", Name);
+  return OFFLOAD_FAIL;
+}
+
+/// Record in KernelsToModuleMap, for every kernel name reported by each
+/// module in Modules, the module that reported it. Returns OFFLOAD_SUCCESS.
+int32_t L0ProgramTy::loadModuleKernels() {
+  // This has to happen before the offload entries are filled in, because at
+  // that point we do not know which module contains a kernel with a name.
+  for (auto Module : Modules) {
+    // The first query passes no buffer and only retrieves the name count.
+    uint32_t Count = 0;
+    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);
+    if (Count == 0)
+      continue;
+
+    llvm::SmallVector<const char *> Names(Count);
+    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, Names.data());
+
+    for (auto *Name : Names)
+      KernelsToModuleMap.emplace(Name, Module);
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
new file mode 100644
index 0000000000000..3721d686393bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
@@ -0,0 +1,71 @@
+//===--- level_zero/src/OmpWrapper.cpp --------------------------- C++ -*-===//
+//
+// Implement wrapper for OpenMP compatibility through dlopen
+//
+//===----------------------------------------------------------------------===//
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "L0Defs.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(omp_get_max_teams, 0)
+DLWRAP_INTERNAL(omp_get_teams_thread_limit, 0)
+
+DLWRAP_FINALIZE()
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+// Resolve every dlwrap-registered symbol from libomp.so. Returns true once
+// all symbols are bound; a failed attempt is retried on the next call.
+// NOTE(review): Loaded is not synchronized; confirm calls are serialized.
+static bool loadOpenMP() {
+  static bool Loaded{false};
+  if (Loaded)
+    return true;
+  const char *OpenMPLibrary = "libomp.so";
+  std::string ErrMsg;
+  DP("Trying to load %s\n", OpenMPLibrary);
+  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+      llvm::sys::DynamicLibrary::getPermanentLibrary(OpenMPLibrary, &ErrMsg));
+  if (!DynlibHandle->isValid()) {
+    if (ErrMsg.empty())
+      ErrMsg = "unknown error";
+    DP("Unable to load library '%s': %s!\n", OpenMPLibrary, ErrMsg.c_str());
+    return false;
+  }
+
+  for (size_t I = 0; I < dlwrap::size(); I++) {
+    const char *Sym = dlwrap::symbol(I);
+    void *P = DynlibHandle->getAddressOfSymbol(Sym);
+    if (P == nullptr) {
+      DP("Unable to find '%s' in '%s'!\n", Sym, OpenMPLibrary);
+      return false;
+    }
+    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+    *dlwrap::pointer(I) = P;
+  }
+  // Remember success; without this the library was re-resolved on every call.
+  Loaded = true;
+  return true;
+}
+
+// Forwards to libomp's omp_get_max_teams; yields 0 when loadOpenMP() fails.
+int omp_get_max_teams() {
+  const bool HaveOpenMP = loadOpenMP();
+  return HaveOpenMP ? dlwrap_omp_get_max_teams() : 0;
+}
+
+// Forwards to libomp's omp_get_teams_thread_limit; 0 if loadOpenMP() fails.
+int omp_get_teams_thread_limit() {
+  const bool HaveOpenMP = loadOpenMP();
+  return HaveOpenMP ? dlwrap_omp_get_teams_thread_limit() : 0;
+}

>From f8956cd31b8abdabe3714108489fd2795ffd6013 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:02:58 +0200
Subject: [PATCH 02/70] Update offload/CMakeLists.txt

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 8a704ab05eb53..3432ca3c29059 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -180,7 +180,7 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
         CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
   if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
     message(STATUS "Not building Level Zero plugin: it is only supported on "
-	           "Linux/Windows x86_64, ppc64le, or aarch64 hosts")
+	           "Linux/Windows x86_64 or ppc64le hosts")
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
   endif()
 endif()

>From d3fc4d70f7e62cf5e3993e3c431c1430a8ff2d22 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:03:12 +0200
Subject: [PATCH 03/70] Update
 offload/plugins-nextgen/level_zero/CMakeLists.txt

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/plugins-nextgen/level_zero/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index b9c8dd423c3ca..8e465d663655c 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-return()
+  return()
 endif()
 
 # Create the library and add the default arguments.

>From 4b383862881d6201e44885da90f30c48312ab8dd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:09:41 +0200
Subject: [PATCH 04/70] Update
 offload/plugins-nextgen/level_zero/include/L0Plugin.h

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/plugins-nextgen/level_zero/include/L0Plugin.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 4658c1cdab1df..de78ded59c2ce 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -45,7 +45,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   /// L0 plugin global options
   static L0OptionsTy Options;
 
-  /// Global mutex
   std::mutex GlobalMutex;
 
   /// Common pool of AsyncQueue

>From 6c1c820a923a3f017f4bc9cf054d9c6b39bb6f77 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 10:47:46 +0200
Subject: [PATCH 05/70] Replace pragma once

---
 offload/plugins-nextgen/level_zero/include/AsyncQueue.h | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Context.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Defs.h     | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Device.h   | 4 +++-
 offload/plugins-nextgen/level_zero/include/L0Interop.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Kernel.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Memory.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Options.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Plugin.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Program.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Trace.h    | 5 ++++-
 offload/plugins-nextgen/level_zero/include/TLS.h        | 5 ++++-
 12 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 105f68205e402..e26661a613772 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 
 #include <vector>
 
@@ -48,3 +49,5 @@ typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index b2b6def8101ca..69748a3e61d01 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
 
 #include "L0Memory.h"
 #include "PerThreadTable.h"
@@ -136,3 +137,5 @@ class L0ContextTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 81566f52a2aea..05c287f4da013 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -9,7 +9,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
 
 #include "PluginInterface.h"
 #include "Shared/Requirements.h"
@@ -71,3 +72,5 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
                        __func__);
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 6acfa7e0ee67d..e22cfd928c0af 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
 
 #include "llvm/ADT/SmallVector.h"
 
@@ -678,3 +679,4 @@ class L0DeviceTy final : public GenericDeviceTy {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
index 4b8b417f9b339..69a1a5f274068 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Interop.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
 
 namespace llvm::omp::target::plugin::L0Interop {
 
@@ -23,3 +24,5 @@ struct Property {
 };
 
 } // namespace llvm::omp::target::plugin::L0Interop
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index bc6fc54cdea08..eca416d6fa882 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
 
 #include "L0Defs.h"
 #include "L0Trace.h"
@@ -152,3 +153,5 @@ class L0KernelTy : public GenericKernelTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 50af80a19a93a..f5547201c994f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
 
 #include <cassert>
 #include <level_zero/ze_api.h>
@@ -572,3 +573,5 @@ class StagingBufferTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b3ecd25f56ddd..a501df693f311 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
 
 #include <level_zero/ze_api.h>
 
@@ -187,3 +188,5 @@ struct L0OptionsTy {
 }; // L0OptionsTy
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index de78ded59c2ce..9fbdafa288592 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
 
 #include "AsyncQueue.h"
 #include "L0Defs.h"
@@ -133,3 +134,5 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index a548b486f4642..d156cce268182 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
 
 #include "L0Kernel.h"
 
@@ -133,3 +134,5 @@ bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
 bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
                         uint64_t &MinorVer);
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index 2eeae81016dee..f8519bd44ae79 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 // clang-format off
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
 
 #include "Shared/Debug.h"
 #include "omptarget.h"
@@ -191,3 +192,5 @@ inline const char *getZeErrorName(int32_t Error) {
     return "ZE_RESULT_ERROR_UNKNOWN";
   }
 }
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
index 8a5f41312e129..46086ee4b6d19 100644
--- a/offload/plugins-nextgen/level_zero/include/TLS.h
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
 
 #include "AsyncQueue.h"
 #include "L0Memory.h"
@@ -84,3 +85,5 @@ struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H

>From fd91c47605a4ad06d6a5780d69e530995f4e2035 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:20:34 +0200
Subject: [PATCH 06/70] Address review comments

---
 .../level_zero/include/AsyncQueue.h           |  2 ++
 .../level_zero/include/L0Defs.h               |  3 +-
 .../level_zero/include/L0Memory.h             |  2 +-
 .../level_zero/include/L0Options.h            |  8 ++---
 .../level_zero/src/L0Device.cpp               |  6 ++--
 .../level_zero/src/L0Kernel.cpp               | 10 +++++-
 .../level_zero/src/L0Options.cpp              |  2 +-
 .../level_zero/src/L0Program.cpp              | 32 +++++++++----------
 8 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index e26661a613772..2d32f1767a7b6 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -13,6 +13,8 @@
 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 
+#include <list>
+#include <tuple>
 #include <vector>
 
 #include "L0Memory.h"
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 05c287f4da013..66d38cd7b9eb5 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -1,4 +1,5 @@
 //===--- Level Zero Target RTL Implementation -----------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -43,7 +44,7 @@ LIBOMP_DECL(double, omp_get_wtime(void));
 namespace llvm::omp::target::plugin {
 
 /// Default alignmnet for allocation
-constexpr size_t L0Alignment = 0;
+constexpr size_t L0DefaultAlignment = 0;
 /// Default staging buffer size for host to device copy (16KB)
 constexpr size_t L0StagingBufferSize = (1 << 14);
 /// Default staging buffer count
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index f5547201c994f..63115b1a3c529 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -506,7 +506,7 @@ class StagingBufferTy {
     void *Ret = nullptr;
     size_t AllocSize = Size * Count;
     CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
-                     L0Alignment, &Ret);
+                     L0DefaultAlignment, &Ret);
     Buffers.push_back(Ret);
     return Ret;
   }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index a501df693f311..7e64f71054569 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -141,9 +141,9 @@ struct L0OptionsTy {
 
   bool Init = false; // have the options already been processed
 
-  /// Read environment variables
   L0OptionsTy() {}
 
+  /// Read environment variables
   void processEnvironmentVars();
 
   void init() {
@@ -155,9 +155,9 @@ struct L0OptionsTy {
 
   /// Parse the string and split it into tokens of string_views based on the
   /// Delim character.
-  std::vector<std::string_view> tokenize(const std::string_view &Filter,
-                                         const std::string &Delim,
-                                         bool ProhibitEmptyTokens = false);
+  static std::vector<std::string_view>
+  tokenize(const std::string_view &Filter, const std::string &Delim,
+           bool ProhibitEmptyTokens = false);
 
   bool isDigits(const std::string_view &str) {
     if (str.size() == 0)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 0029d00a07685..2235741ea70a4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -64,15 +64,15 @@ constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0
 DeviceArchTy L0DeviceTy::computeArch() const {
   const auto PCIDeviceId = getPCIId();
   if (PCIDeviceId != 0) {
-    for (int arch = 0; arch < DeviceArchMapSize; arch++) {
+    for (int ArchIndex = 0; ArchIndex < DeviceArchMapSize; ArchIndex++) {
       for (int i = 0;; i++) {
-        const auto Id = DeviceArchMap[arch].ids[i];
+        const auto Id = DeviceArchMap[ArchIndex].ids[i];
         if (Id == PCIIdTy::None)
           break;
 
         auto maskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
         if (maskedId == Id)
-          return DeviceArchMap[arch].arch; // Exact match or prefix match
+          return DeviceArchMap[ArchIndex].arch; // Exact match or prefix match
       }
     }
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index d1cb0b7bd50bd..b0a13a07ab919 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,7 +163,9 @@ void L0KernelTy::decideKernelGroupArguments(
   uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
   bool UsedReductionSubscriptionRate = false;
   if (!MaxGroupCountForced) {
-    { GRPCounts[0] *= OptSubscRate; }
+    { 
+      GRPCounts[0] *= OptSubscRate; 
+    }
 
     size_t LoopTripcount = 0;
     if (LoopLevels) {
@@ -626,6 +628,12 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
     }
   } else {
     ze_event_handle_t Event = nullptr;
+    if (AllowCooperative)
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                       zeKernel, &GroupCounts, Event, 0, nullptr);
+    else
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                       &GroupCounts, Event, 0, nullptr);
     KernelLock.unlock();
     CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
     CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 3acb2e78927e7..cb3a23b3e8bd4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -99,7 +99,7 @@ void L0OptionsTy::processEnvironmentVars() {
 
         std::string_view TopDeviceStr = DeviceSubTuple[0];
         static const std::array<std::string, 7> DeviceStr = {
-            "host", "cpu", "gpu", "acc", "fpga", "*"};
+            "host", "cpu", "gpu", "acc", "*"};
         auto It =
             find_if(DeviceStr.begin(), DeviceStr.end(),
                     [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 33c19b0e7c50d..9828f379e681a 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -132,22 +132,22 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
     if (IsLibModule)
       return OFFLOAD_SUCCESS;
     return OFFLOAD_FAIL;
-  } else {
-    // Check if module link is required. We do not need this check for
-    // library module
-    if (!RequiresModuleLink && !IsLibModule) {
-      ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
-                                           nullptr, 0};
-      CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
-      RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
-    }
-    // For now, assume the first module contains libraries, globals.
-    if (Modules.empty())
-      GlobalModule = Module;
-    Modules.push_back(Module);
-    l0Device.addGlobalModule(Module);
-    return OFFLOAD_SUCCESS;
   }
+
+  // Check if module link is required. We do not need this check for
+  // library module
+  if (!RequiresModuleLink && !IsLibModule) {
+    ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+                                         nullptr, 0};
+    CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+    RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+  }
+  // For now, assume the first module contains libraries, globals.
+  if (Modules.empty())
+    GlobalModule = Module;
+  Modules.push_back(Module);
+  l0Device.addGlobalModule(Module);
+  return OFFLOAD_SUCCESS;
 }
 
 int32_t L0ProgramTy::linkModules() {
@@ -376,8 +376,6 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
          "isValidOneOmpImage() returns true for invalid ELF image");
   auto processELF = [&](auto *EObj) {
     assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
-    assert(MajorVer == 1 && MinorVer == 0 &&
-           "FIXME: update image processing for new oneAPI OpenMP version.");
     const auto &E = EObj->getELFFile();
     // Collect auxiliary information.
     uint64_t MaxImageIdx = 0;

>From 84665dc22f710e4401e65dea6d20e18128c94daa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:44:35 +0200
Subject: [PATCH 07/70] Fix makefile format

---
 .../plugins-nextgen/level_zero/CMakeLists.txt | 92 ++++++++++---------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 8e465d663655c..df38671c040ab 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -6,64 +6,68 @@ endif()
 add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
 
 set(LEVEL_ZERO_SRC_FILES
-        src/L0Context.cpp
-        src/L0Device.cpp
-        src/L0Kernel.cpp
-        src/L0Memory.cpp
-        src/L0Program.cpp
-        src/L0Plugin.cpp
-        src/L0Program.cpp
-        src/L0Options.cpp
+  src/L0Context.cpp
+  src/L0Device.cpp
+  src/L0Kernel.cpp
+  src/L0Memory.cpp
+  src/L0Program.cpp
+  src/L0Plugin.cpp
+  src/L0Program.cpp
+  src/L0Options.cpp
 )
 list(APPEND LEVEL_ZERO_SRC_FILES
-        src/OmpWrapper.cpp
+  src/OmpWrapper.cpp
 )
 
 target_sources(omptarget.rtl.level_zero PRIVATE
-   ${LEVEL_ZERO_SRC_FILES}
+  ${LEVEL_ZERO_SRC_FILES}
 )
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
-      ${CMAKE_CURRENT_SOURCE_DIR}/include
-      ${CMAKE_CURRENT_SOURCE_DIR}/src
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
-      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
-      ${LIBOMPTARGET_OMP_HEADER_DIR}
+  ${LIBOMPTARGET_INCLUDE_DIR}
+  ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+  ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+  ${LIBOMPTARGET_OMP_HEADER_DIR}
 )
 
 if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
-message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+  message(STATUS "Building Level Zero NG plugin linked against level_zero library")
 
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  target_link_libraries(omptarget.rtl.level_zero PRIVATE
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
-elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
-  # Full path to the L0 library is recognized as a linker option, so we
-  # separate directory and file name
-  get_filename_component(LEVEL_ZERO_LIBRARY_PATH
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
-  get_filename_component(LEVEL_ZERO_LIBRARY_NAME
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-  target_link_libraries(omptarget.rtl.level_zero PRIVATE
-          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
-  target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
-  target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
-  libomptarget_add_resource_file(omptarget.rtl.level_zero)
+  if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(omptarget.rtl.level_zero PRIVATE
+                        ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+  elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    # Full path to the L0 library is recognized as a linker option, so we
+    # separate directory and file name
+    get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+    get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+    target_link_libraries(omptarget.rtl.level_zero PRIVATE
+                          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+    target_link_directories(omptarget.rtl.level_zero PRIVATE 
+                            ${LEVEL_ZERO_LIBRARY_PATH})
+    target_link_options(omptarget.rtl.level_zero PRIVATE 
+                        "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+    libomptarget_add_resource_file(omptarget.rtl.level_zero)
+  else()
+    message(FATAL_ERROR "Missing platform support")
+  endif()
 else()
-   message(FATAL_ERROR "Missing platfrom support")
-endif()
-
-else()
-message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
-get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-if(CMAKE_SYSTEM_NAME MATCHES "Windows")
-   # Windows uses dll instead of lib files at runtime
-   string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
-endif()
-target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
-target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+  message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+  get_filename_component(LEVEL_ZERO_LIBRARY_NAME 
+                         ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+  if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    # Windows uses dll instead of lib files at runtime
+    string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME 
+           ${LEVEL_ZERO_LIBRARY_NAME})
+  endif()
+  target_compile_options(omptarget.rtl.level_zero PRIVATE 
+                         "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+  target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
 endif()

>From a2217dbd426065c7b0f831a66947738a636b0c74 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 14:56:40 +0200
Subject: [PATCH 08/70] change to StringRef in multiple places

---
 .../level_zero/include/AsyncQueue.h           |  2 +-
 .../level_zero/include/L0Options.h            | 29 ++-------
 .../level_zero/src/L0Device.cpp               |  7 ++-
 .../level_zero/src/L0Program.cpp              | 59 ++++++++++---------
 4 files changed, 43 insertions(+), 54 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 2d32f1767a7b6..dfa8c54b1c124 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Async Queue wrapper for SPIR-V/Xe machine
+// Async Queue wrapper for Level Zero
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 7e64f71054569..ba62aa9ac0afa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -63,7 +63,6 @@ class SpecConstantsTy {
     return Tmp;
   }
 };
-#define FIXED static constexpr
 
 /// L0 Plugin flags
 struct L0OptionFlagsTy {
@@ -94,7 +93,7 @@ struct L0OptionsTy {
   std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
 
   /// Oversubscription rate for normal kernels
-  FIXED uint32_t SubscriptionRate = 4;
+  uint32_t SubscriptionRate = 4;
 
   /// Loop kernels with known ND-range may be known to have
   /// few iterations and they may not exploit the offload device
@@ -112,7 +111,7 @@ struct L0OptionsTy {
   /// in the kernel should decrease.
   /// Anyway, this is just a heuristics that seems to work well for some
   /// kernels (which poorly expose parallelism in the first place).
-  FIXED double ThinThreadsThreshold = 0.1;
+  double ThinThreadsThreshold = 0.1;
 
   /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
   /// All the discard filter should be before the accept filter.
@@ -127,8 +126,8 @@ struct L0OptionsTy {
   // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
   // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
   // builtins.
-  std::string CompilationOptions = "-cl-std=CL2.0 ";
-  std::string InternalCompilationOptions = "-cl-take-global-address";
+  static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
+  static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
   std::string UserCompilationOptions = "";
 
   // Spec constants used for all modules.
@@ -165,24 +164,8 @@ struct L0OptionsTy {
     return std::all_of(str.begin(), str.end(), ::isdigit);
   }
 
-  bool match(const std::string &Var, const std::string &Matched) {
-    if (Var.size() != Matched.size())
-      return false;
-
-    auto equals = [](char a, char b) {
-      return std::tolower(a) == std::tolower(b);
-    };
-    return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
-                      equals);
-  }
-
-  bool match(const std::string &Var, const char *Matched) {
-    std::string Str(Matched);
-    return match(Var, Str);
-  }
-
-  bool match(const StringEnvar &Var, const char *Matched) {
-    return match(Var.get(), Matched);
+  bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
+    return Matched.equals_insensitive(Var.get());
   }
 
 }; // L0OptionsTy
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 2235741ea70a4..1ef66751655d6 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -413,13 +413,14 @@ L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
   (void)NumEntries; // silence warning
 
   const auto &Options = getPlugin().getOptions();
-  std::string CompilationOptions(Options.CompilationOptions + " " +
-                                 Options.UserCompilationOptions);
+  std::string CompilationOptions(Options.CompilationOptions);
+  CompilationOptions += " " + Options.UserCompilationOptions;
 
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
        "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
 
-  CompilationOptions += " " + Options.InternalCompilationOptions;
+  CompilationOptions += " ";
+  CompilationOptions += Options.InternalCompilationOptions;
   auto &Program = addProgram(ImageId, TgtImage);
 
   int32_t RC = Program.buildModules(CompilationOptions);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 9828f379e681a..e7448757b9141 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -395,7 +395,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
           continue;
 
         const uint64_t Type = Note.getType();
-        std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+        auto DescStrRef = Note.getDescAsStringRef(4);
         switch (Type) {
         default:
           DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
@@ -403,19 +403,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
           break;
         case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
-          ImageCount = std::stoull(DescStr);
+          if (!DescStrRef.getAsInteger(10, ImageCount)) {
+            DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
+               DescStrRef.str().c_str());
+            ImageCount = 0;
+          }
           break;
-        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
-          std::vector<std::string> Parts;
-          do {
-            const auto DelimPos = DescStr.find('\0');
-            if (DelimPos == std::string::npos) {
-              Parts.push_back(std::move(DescStr));
-              break;
-            }
-            Parts.push_back(DescStr.substr(0, DelimPos));
-            DescStr.erase(0, DelimPos + 1);
-          } while (Parts.size() < 4);
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX:
+          llvm::SmallVector<llvm::StringRef, 4> Parts;
+          DescStrRef.split(Parts, '\0', /* MaxSplit = */ 4,
+                           /* KeepEmpty = */ false);
 
           // Ignore records with less than 4 strings.
           if (Parts.size() != 4) {
@@ -424,7 +421,8 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
             continue;
           }
 
-          const uint64_t Idx = std::stoull(Parts[0]);
+          uint64_t Idx = 0;
+          Parts[0].getAsInteger(10, Idx);
           MaxImageIdx = (std::max)(MaxImageIdx, Idx);
           if (AuxInfo.find(Idx) != AuxInfo.end()) {
             DP("Warning: duplicate auxiliary information for image %" PRIu64
@@ -432,13 +430,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
                Idx);
             continue;
           }
+
+          uint64_t Part1Id;
+          Parts[1].getAsInteger(10, Part1Id);
+
           AuxInfo.emplace(
               std::piecewise_construct, std::forward_as_tuple(Idx),
-              std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+              std::forward_as_tuple(Part1Id, Parts[2].str(), Parts[3].str()));
           // Image pointer and size
           // will be initialized later.
         }
-        }
       }
     }
 
@@ -450,24 +451,28 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
       auto ExpectedSectionName = E.getSectionName(Sec);
       assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
                                     "image with invalid section names");
-      std::string SectionName = (*ExpectedSectionName).str();
-      if (SectionName.find(Prefix) != 0)
+      auto &SectionNameRef = *ExpectedSectionName;
+      if (!SectionNameRef.consume_front(Prefix))
         continue;
-      SectionName.erase(0, std::strlen(Prefix));
 
       // Expected section name in split-kernel mode:
       // __openmp_offload_spirv_<image_id>_<part_id>
-      auto PartIdLoc = SectionName.find("_");
-      if (PartIdLoc != std::string::npos) {
-        DP("Found a split section in the image\n");
-        // It seems that we do not need part ID as long as they are ordered
-        // in the image and we keep the ordering in the runtime.
-        SectionName.erase(PartIdLoc);
-      } else {
+      auto Parts = SectionNameRef.split('_');
+      // It seems that we do not need part ID as long as they are ordered
+      // in the image and we keep the ordering in the runtime.
+      SectionNameRef = Parts.first;
+      if (Parts.second.empty()) {
         DP("Found a single section in the image\n");
+      } else {
+        DP("Found a split section in the image\n");
       }
 
-      uint64_t Idx = std::stoull(SectionName);
+      uint64_t Idx = 0;
+      if (!SectionNameRef.getAsInteger(10, Idx)) {
+        DP("Warning: ignoring image section (invalid index '%s').\n",
+           SectionNameRef.str().c_str());
+        continue;
+      }
       if (Idx >= ImageCount) {
         DP("Warning: ignoring image section (index %" PRIu64
            " is out of range).\n",

>From 08880a623e89a5eb7f9deebce9ced19e6d2b9e1a Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 09:49:34 +0200
Subject: [PATCH 09/70] remove tokenize

---
 .../level_zero/include/L0Options.h            |  12 --
 .../level_zero/src/L0Options.cpp              | 106 ++++++------------
 2 files changed, 37 insertions(+), 81 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index ba62aa9ac0afa..b08a07f52fcc0 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -152,18 +152,6 @@ struct L0OptionsTy {
     }
   }
 
-  /// Parse the string and split it into tokens of string_views based on the
-  /// Delim character.
-  static std::vector<std::string_view>
-  tokenize(const std::string_view &Filter, const std::string &Delim,
-           bool ProhibitEmptyTokens = false);
-
-  bool isDigits(const std::string_view &str) {
-    if (str.size() == 0)
-      return false;
-    return std::all_of(str.begin(), str.end(), ::isdigit);
-  }
-
   bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
     return Matched.equals_insensitive(Var.get());
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index cb3a23b3e8bd4..d0871c715b180 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -53,43 +53,51 @@ void L0OptionsTy::processEnvironmentVars() {
   if (DeviceSelectorVar.isPresent()) {
     std::string EnvStr(std::move(DeviceSelectorVar.get()));
     uint32_t numDiscard = 0;
-    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
-                   [](unsigned char C) { return std::tolower(C); });
+    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
 
-    std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+    llvm::StringRef EnvRef(EnvStr);
+    llvm::SmallVector<llvm::StringRef> Entries;
+    EnvRef.split(Entries, ';', /* MaxSplit = */ 0,
+                 /* KeepEmpty = */ false);
     for (const auto &Term : Entries) {
       bool isDiscard = false;
-      std::vector<std::string_view> Pair = tokenize(Term, ":", true);
-      if (Pair.empty()) {
+
+      auto Parts = Term.split(':');
+      if (Parts.first.empty()) {
         FAILURE_MESSAGE(
             "Incomplete selector! Pair and device must be specified.\n");
-      } else if (Pair.size() == 1) {
-        FAILURE_MESSAGE("Incomplete selector!  Try '%s:*'if all devices "
-                        "under the Pair was original intention.\n",
-                        Pair[0].data());
-      } else if (Pair.size() > 2) {
+      }
+      if (Parts.second.empty()) {
+        FAILURE_MESSAGE(
+            "Incomplete selector! Pair and device must be specified.\n");
+      }
+      if (Parts.second.contains(':')) {
         FAILURE_MESSAGE(
             "Error parsing selector string \"%s\" Too many colons (:)\n",
             Term.data());
       }
-      if (!((Pair[0][0] == '*') ||
-            (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
-            (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+
+      if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
+          Parts.first == "!level_zero"))
         break;
-      isDiscard = Pair[0][0] == '!';
+      isDiscard = Parts.first[0] == '!';
+
       if (isDiscard)
         numDiscard++;
       else if (numDiscard > 0)
         FAILURE_MESSAGE("All negative(discarding) filters must appear after "
                         "all positive(accepting) filters!");
 
-      std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+      llvm::SmallVector<llvm::StringRef> Targets;
+      Parts.second.split(Targets, ',', /* MaxSplit = */ 0,
+                         /* KeepEmpty = */ false);
       for (const auto &TargetStr : Targets) {
         bool HasDeviceWildCard = false;
         bool HasSubDeviceWildCard = false;
         bool DeviceNum = false;
-        std::vector<std::string_view> DeviceSubTuple =
-            tokenize(TargetStr, ".", true);
+        llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
+        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
+                         /* KeepEmpty = */ false);
         int32_t RootD[3] = {-1, -1, -1};
         if (DeviceSubTuple.empty()) {
           FAILURE_MESSAGE(
@@ -97,7 +105,7 @@ void L0OptionsTy::processEnvironmentVars() {
               "specified.");
         }
 
-        std::string_view TopDeviceStr = DeviceSubTuple[0];
+        auto TopDeviceStr = DeviceSubTuple[0];
         static const std::array<std::string, 7> DeviceStr = {
             "host", "cpu", "gpu", "acc", "*"};
         auto It =
@@ -107,15 +115,13 @@ void L0OptionsTy::processEnvironmentVars() {
           if (TopDeviceStr[0] == '*') {
             HasDeviceWildCard = true;
             RootD[0] = -2;
-          } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+          } else if (TopDeviceStr == "gpu")
             continue;
         } else {
-          std::string TDS(TopDeviceStr);
-          if (!isDigits(TDS)) {
+          if (TopDeviceStr.getAsInteger(10, RootD[0])) {
             FAILURE_MESSAGE("error parsing device number: %s",
-                            DeviceSubTuple[0].data());
+                            DeviceSubTuple[0].str().c_str());
           } else {
-            RootD[0] = std::stoi(TDS);
             DeviceNum = true;
           }
         }
@@ -124,7 +130,7 @@ void L0OptionsTy::processEnvironmentVars() {
             FAILURE_MESSAGE("sub-devices can only be requested when parent "
                             "device is specified by number or wildcard, not a "
                             "device type like \'gpu\'");
-          std::string_view SubDeviceStr = DeviceSubTuple[1];
+          auto SubDeviceStr = DeviceSubTuple[1];
           if (SubDeviceStr[0] == '*') {
             HasSubDeviceWildCard = true;
             RootD[1] = -2;
@@ -134,28 +140,24 @@ void L0OptionsTy::processEnvironmentVars() {
                   "sub-device can't be requested by number if parent "
                   "device is specified by a wildcard.");
 
-            std::string SDS(SubDeviceStr);
-            if (!isDigits(SDS)) {
+            if (!SubDeviceStr.getAsInteger(10, RootD[1])) {
               FAILURE_MESSAGE("error parsing subdevice index: %s",
-                              DeviceSubTuple[1].data());
-            } else
-              RootD[1] = std::stoi(SDS);
+                              DeviceSubTuple[1].str().c_str());
+            }
           }
         }
         if (DeviceSubTuple.size() == 3) {
-          std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+          auto SubSubDeviceStr = DeviceSubTuple[2];
           if (SubSubDeviceStr[0] == '*') {
             RootD[2] = -2;
           } else {
             if (HasSubDeviceWildCard)
               FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
                               "sub-device before is specified by a wildcard.");
-            std::string SSDS(SubSubDeviceStr);
-            if (!isDigits(SSDS)) {
+            if (!SubSubDeviceStr.getAsInteger(10, RootD[2])) {
               FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
-                              DeviceSubTuple[2].data());
-            } else
-              RootD[2] = std::stoi(SSDS);
+                              DeviceSubTuple[2].str().c_str());
+            }
           }
         } else if (DeviceSubTuple.size() > 3) {
           FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
@@ -333,39 +335,5 @@ void L0OptionsTy::processEnvironmentVars() {
                      CommandModeVar.get().c_str());
   }
 }
-/// Parse String  and split into tokens of string_views based on the
-/// Delim character.
-std::vector<std::string_view>
-L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
-                      bool ProhibitEmptyTokens) {
-  std::vector<std::string_view> Tokens;
-  size_t Pos = 0;
-  size_t LastPos = 0;
-  while ((Pos = Filter.find(Delim, LastPos)) != std::string::npos) {
-    std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
-
-    if (!Tok.empty()) {
-      Tokens.push_back(Tok);
-    } else if (ProhibitEmptyTokens) {
-      FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
-                      "before '%s'delimiter is not allowed.",
-                      Delim.c_str());
-    }
-    // move the search starting index
-    LastPos = Pos + 1;
-  }
-
-  // Add remainder if any
-  if (LastPos < Filter.size()) {
-    std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
-    Tokens.push_back(Tok);
-  } else if ((LastPos != 0) && ProhibitEmptyTokens) {
-    // if delimiter is the last sybmol in the string.
-    FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
-                    "'%s' delimiter is not allowed.",
-                    Delim.c_str());
-  }
-  return Tokens;
-}
 
 } // namespace llvm::omp::target::plugin

>From 9a3088c49bc52ce643c63223608802eefc04b924 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 12:47:02 +0200
Subject: [PATCH 10/70] remove unused code

---
 .../level_zero/include/L0Program.h            |  1 -
 .../level_zero/include/L0Trace.h              |  7 ----
 .../level_zero/src/L0Program.cpp              | 38 -------------------
 3 files changed, 46 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index d156cce268182..ca8b3b8a5cf52 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -60,7 +60,6 @@ class L0ProgramTy : public DeviceImageTy {
                     const std::string &BuildOption, ze_module_format_t Format);
   /// Read file and return the size of the binary if successful.
   size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
-  int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
   void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
                                         std::string &Options) const;
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index f8519bd44ae79..0faa76171cbc9 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -27,13 +27,6 @@
       DP(__VA_ARGS__);                                                         \
   } while (0)
 
-#define FATAL_ERROR(Msg)                                                       \
-  do {                                                                         \
-    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
-    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg);   \
-    exit(EXIT_FAILURE);                                                        \
-  } while (0)
-
 #define WARNING(...)                                                           \
   do {                                                                         \
     fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index e7448757b9141..eb5da943d56c9 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -187,44 +187,6 @@ size_t L0ProgramTy::readFile(const char *FileName,
   return FileSize;
 }
 
-/// Read SPV from file name
-int32_t L0ProgramTy::readSPVFile(const char *FileName,
-                                 std::vector<uint8_t> &OutSPV) const {
-  // Resolve full path using the location of the plugin
-  std::string FullPath;
-#ifdef _WIN32
-  char RTLPath[_MAX_PATH];
-  HMODULE RTLModule = nullptr;
-  if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
-                              GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-                          (LPCSTR)&__tgt_target_data_begin_nowait,
-                          &RTLModule)) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  FullPath = RTLPath;
-#else  // _WIN32
-  Dl_info RTLInfo;
-  if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  FullPath = RTLInfo.dli_fname;
-#endif // _WIN32
-  const size_t PathSep = FullPath.find_last_of("/\\");
-  FullPath.replace(PathSep + 1, std::string::npos, FileName);
-  // Read from the full path
-  if (!readFile(FullPath.c_str(), OutSPV)) {
-    DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
 void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
                                                    std::string &Options) const {
   // Options that need to be replaced with backend-specific options

>From 24d06455603aaf372ce20a69b776ba237f1edaaa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 13:04:12 +0200
Subject: [PATCH 11/70] fix format

---
 offload/plugins-nextgen/level_zero/include/L0Options.h | 3 ++-
 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp    | 4 ++--
 offload/plugins-nextgen/level_zero/src/L0Options.cpp   | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b08a07f52fcc0..e383f070f10aa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -127,7 +127,8 @@ struct L0OptionsTy {
   // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
   // builtins.
   static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
-  static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
+  static constexpr std::string_view InternalCompilationOptions =
+      "-cl-take-global-address";
   std::string UserCompilationOptions = "";
 
   // Spec constants used for all modules.
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b0a13a07ab919..538e627405b6d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,8 +163,8 @@ void L0KernelTy::decideKernelGroupArguments(
   uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
   bool UsedReductionSubscriptionRate = false;
   if (!MaxGroupCountForced) {
-    { 
-      GRPCounts[0] *= OptSubscRate; 
+    {
+      GRPCounts[0] *= OptSubscRate;
     }
 
     size_t LoopTripcount = 0;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index d0871c715b180..1e0baa3f2b089 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -78,7 +78,7 @@ void L0OptionsTy::processEnvironmentVars() {
       }
 
       if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
-          Parts.first == "!level_zero"))
+            Parts.first == "!level_zero"))
         break;
       isDiscard = Parts.first[0] == '!';
 
@@ -97,7 +97,7 @@ void L0OptionsTy::processEnvironmentVars() {
         bool DeviceNum = false;
         llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
         TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
-                         /* KeepEmpty = */ false);
+                        /* KeepEmpty = */ false);
         int32_t RootD[3] = {-1, -1, -1};
         if (DeviceSubTuple.empty()) {
           FAILURE_MESSAGE(
@@ -106,8 +106,8 @@ void L0OptionsTy::processEnvironmentVars() {
         }
 
         auto TopDeviceStr = DeviceSubTuple[0];
-        static const std::array<std::string, 7> DeviceStr = {
-            "host", "cpu", "gpu", "acc", "*"};
+        static const std::array<std::string, 7> DeviceStr = {"host", "cpu",
+                                                             "gpu", "acc", "*"};
         auto It =
             find_if(DeviceStr.begin(), DeviceStr.end(),
                     [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });

>From 0eb57125a5caff61ac4de16256965f4fad0ad120 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 16:31:25 +0200
Subject: [PATCH 12/70] Remove environment variable

---
 .../level_zero/include/L0Options.h            |   7 -
 .../level_zero/src/L0Options.cpp              | 162 ------------------
 .../level_zero/src/L0Plugin.cpp               |  10 +-
 .../level_zero/src/L0Program.cpp              |   5 +-
 4 files changed, 4 insertions(+), 180 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index e383f070f10aa..8c79a82ef724b 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -113,13 +113,6 @@ struct L0OptionsTy {
   /// kernels (which poorly expose parallelism in the first place).
   double ThinThreadsThreshold = 0.1;
 
-  /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
-  /// All the discard filter should be before the accept filter.
-  std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
-
-  /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
-  bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
-
   // Compilation options for IGC
   // OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
   // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 1e0baa3f2b089..7229e2498ae13 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -18,29 +18,6 @@
 
 namespace llvm::omp::target::plugin {
 
-/// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
-bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
-                                  int32_t CCSID) const {
-  if (ExplicitRootDevices.empty())
-    return false;
-  for (const auto &RootDev : ExplicitRootDevices) {
-    const auto ErootID = std::get<1>(RootDev);
-    if (ErootID != -2 && RootID != ErootID)
-      continue;
-    const auto EsubID = std::get<2>(RootDev);
-    if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
-      continue;
-    const auto ECCSID = std::get<3>(RootDev);
-    if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
-      continue;
-    // Check if isDiscard
-    if (!std::get<0>(RootDev))
-      return false;
-    return true;
-  }
-  return false;
-}
-
 /// Read environment variables
 void L0OptionsTy::processEnvironmentVars() {
   // Compilation options for IGC
@@ -48,145 +25,6 @@ void L0OptionsTy::processEnvironmentVars() {
       std::string(" ") +
       StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
 
-  // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
-  const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
-  if (DeviceSelectorVar.isPresent()) {
-    std::string EnvStr(std::move(DeviceSelectorVar.get()));
-    uint32_t numDiscard = 0;
-    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
-
-    llvm::StringRef EnvRef(EnvStr);
-    llvm::SmallVector<llvm::StringRef> Entries;
-    EnvRef.split(Entries, ';', /* MaxSplit = */ 0,
-                 /* KeepEmpty = */ false);
-    for (const auto &Term : Entries) {
-      bool isDiscard = false;
-
-      auto Parts = Term.split(':');
-      if (Parts.first.empty()) {
-        FAILURE_MESSAGE(
-            "Incomplete selector! Pair and device must be specified.\n");
-      }
-      if (Parts.second.empty()) {
-        FAILURE_MESSAGE(
-            "Incomplete selector! Pair and device must be specified.\n");
-      }
-      if (Parts.second.contains(':')) {
-        FAILURE_MESSAGE(
-            "Error parsing selector string \"%s\" Too many colons (:)\n",
-            Term.data());
-      }
-
-      if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
-            Parts.first == "!level_zero"))
-        break;
-      isDiscard = Parts.first[0] == '!';
-
-      if (isDiscard)
-        numDiscard++;
-      else if (numDiscard > 0)
-        FAILURE_MESSAGE("All negative(discarding) filters must appear after "
-                        "all positive(accepting) filters!");
-
-      llvm::SmallVector<llvm::StringRef> Targets;
-      Parts.second.split(Targets, ',', /* MaxSplit = */ 0,
-                         /* KeepEmpty = */ false);
-      for (const auto &TargetStr : Targets) {
-        bool HasDeviceWildCard = false;
-        bool HasSubDeviceWildCard = false;
-        bool DeviceNum = false;
-        llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
-        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
-                        /* KeepEmpty = */ false);
-        int32_t RootD[3] = {-1, -1, -1};
-        if (DeviceSubTuple.empty()) {
-          FAILURE_MESSAGE(
-              "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
-              "specified.");
-        }
-
-        auto TopDeviceStr = DeviceSubTuple[0];
-        static const std::array<std::string, 7> DeviceStr = {"host", "cpu",
-                                                             "gpu", "acc", "*"};
-        auto It =
-            find_if(DeviceStr.begin(), DeviceStr.end(),
-                    [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
-        if (It != DeviceStr.end()) {
-          if (TopDeviceStr[0] == '*') {
-            HasDeviceWildCard = true;
-            RootD[0] = -2;
-          } else if (TopDeviceStr == "gpu")
-            continue;
-        } else {
-          if (TopDeviceStr.getAsInteger(10, RootD[0])) {
-            FAILURE_MESSAGE("error parsing device number: %s",
-                            DeviceSubTuple[0].str().c_str());
-          } else {
-            DeviceNum = true;
-          }
-        }
-        if (DeviceSubTuple.size() >= 2) {
-          if (!DeviceNum && !HasDeviceWildCard)
-            FAILURE_MESSAGE("sub-devices can only be requested when parent "
-                            "device is specified by number or wildcard, not a "
-                            "device type like \'gpu\'");
-          auto SubDeviceStr = DeviceSubTuple[1];
-          if (SubDeviceStr[0] == '*') {
-            HasSubDeviceWildCard = true;
-            RootD[1] = -2;
-          } else {
-            if (HasDeviceWildCard) // subdevice is a number and device is a *
-              FAILURE_MESSAGE(
-                  "sub-device can't be requested by number if parent "
-                  "device is specified by a wildcard.");
-
-            if (!SubDeviceStr.getAsInteger(10, RootD[1])) {
-              FAILURE_MESSAGE("error parsing subdevice index: %s",
-                              DeviceSubTuple[1].str().c_str());
-            }
-          }
-        }
-        if (DeviceSubTuple.size() == 3) {
-          auto SubSubDeviceStr = DeviceSubTuple[2];
-          if (SubSubDeviceStr[0] == '*') {
-            RootD[2] = -2;
-          } else {
-            if (HasSubDeviceWildCard)
-              FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
-                              "sub-device before is specified by a wildcard.");
-            if (!SubSubDeviceStr.getAsInteger(10, RootD[2])) {
-              FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
-                              DeviceSubTuple[2].str().c_str());
-            }
-          }
-        } else if (DeviceSubTuple.size() > 3) {
-          FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
-                          "supported at this time ",
-                          TargetStr.data());
-        }
-        if (isDiscard)
-          ExplicitRootDevices.insert(
-              ExplicitRootDevices.begin(),
-              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
-                                                          RootD[1], RootD[2]));
-        else
-          ExplicitRootDevices.push_back(
-              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
-                                                          RootD[1], RootD[2]));
-      }
-    }
-  }
-
-  DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
-     ExplicitRootDevices.size());
-  DP("  (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
-     "-1(ignore)\n");
-  for (auto &T : ExplicitRootDevices) {
-    DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
-       std::get<2>(T), std::get<3>(T));
-    (void)T; // silence warning
-  }
-
   // Memory pool
   // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
   //  <Option>       := 0 | <PoolInfoList>
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 51d6595560484..d632d57ce3d5d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -35,7 +35,6 @@ int32_t LevelZeroPluginTy::findDevices() {
     DP("Cannot find any drivers.\n");
     return 0;
   }
-  const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
 
   // We expect multiple drivers on Windows to support different device types,
   // so we need to maintain multiple drivers and contexts in general.
@@ -93,13 +92,10 @@ int32_t LevelZeroPluginTy::findDevices() {
 
   llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
 
-  // helper lambdas
-  auto addDevice = [ExplicitMode,
-                    &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+  // helper lambda
+  auto addDevice = [&DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
                                    int32_t SubId = -1, int32_t CCSId = -1) {
-    if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
-      DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
-    }
+    DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
   };
   for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
     const auto zeDevice = RootDevices[RootId].zeDevice;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index eb5da943d56c9..8b31bf7e3a7ec 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -128,11 +128,8 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
 
   const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
 
-  if (BuildFailed) {
-    if (IsLibModule)
-      return OFFLOAD_SUCCESS;
+  if (BuildFailed)
     return OFFLOAD_FAIL;
-  }
 
   // Check if module link is required. We do not need this check for
   // library module

>From f491f3dc412729f8e8d4288f7c12ecf659a14dcc Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 18:42:05 +0200
Subject: [PATCH 13/70] fix getAsInteger conditions

---
 offload/plugins-nextgen/level_zero/src/L0Program.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 8b31bf7e3a7ec..68ef755b2a852 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -362,7 +362,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
           break;
         case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
-          if (!DescStrRef.getAsInteger(10, ImageCount)) {
+          if (DescStrRef.getAsInteger(10, ImageCount)) {
             DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
                DescStrRef.str().c_str());
             ImageCount = 0;
@@ -427,7 +427,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
       }
 
       uint64_t Idx = 0;
-      if (!SectionNameRef.getAsInteger(10, Idx)) {
+      if (SectionNameRef.getAsInteger(10, Idx)) {
         DP("Warning: ignoring image section (invalid index '%s').\n",
            SectionNameRef.str().c_str());
         continue;

>From fe633f4a563e97945e87ec937b8e56fe89e7264d Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Fri, 19 Sep 2025 14:35:22 +0200
Subject: [PATCH 14/70] Don't use __tgt_device_image

---
 .../level_zero/include/L0Device.h             | 10 ++--
 .../level_zero/include/L0Program.h            |  6 +--
 .../level_zero/src/L0Device.cpp               | 15 ++----
 .../level_zero/src/L0Program.cpp              | 46 ++++---------------
 4 files changed, 20 insertions(+), 57 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index e22cfd928c0af..df20cfe09c304 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -346,9 +346,9 @@ class L0DeviceTy final : public GenericDeviceTy {
   }
   auto getGlobalModulesArray() { return GlobalModules.data(); }
 
-  L0ProgramTy *getProgramFromImage(const __tgt_device_image *Image) {
+  L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) {
     for (auto &PGM : Programs)
-      if (PGM.getTgtImage() == Image)
+      if (PGM.getMemoryBuffer() == Image)
         return &PGM;
     return nullptr;
   }
@@ -363,8 +363,8 @@ class L0DeviceTy final : public GenericDeviceTy {
   }
 
   // add a new program to the device. Return a reference to the new program
-  auto &addProgram(int32_t ImageId, const __tgt_device_image *Image) {
-    Programs.emplace_back(ImageId, *this, Image);
+  auto &addProgram(int32_t ImageId, std::unique_ptr<MemoryBuffer> &&Image) {
+    Programs.emplace_back(ImageId, *this, std::move(Image));
     return Programs.back();
   }
 
@@ -575,7 +575,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   int32_t makeMemoryResident(void *Mem, size_t Size);
 
   // Generic device interface implementation
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+  Expected<DeviceImageTy *> loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
                                            int32_t ImageId) override;
   Error unloadBinaryImpl(DeviceImageTy *Image) override;
   void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index ca8b3b8a5cf52..7517b57764b87 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -72,8 +72,8 @@ class L0ProgramTy : public DeviceImageTy {
   L0ProgramTy() = delete;
 
   L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
-              const __tgt_device_image *Image)
-      : DeviceImageTy(ImageId, Device, Image) {}
+               std::unique_ptr<MemoryBuffer> Image)
+      : DeviceImageTy(ImageId, Device, std::move(Image)) {}
 
   ~L0ProgramTy();
 
@@ -128,8 +128,6 @@ struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy {
                                     GlobalTy &DeviceGlobal) override;
 };
 
-bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
-                        uint64_t &MinorVer);
 bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
                         uint64_t &MinorVer);
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 1ef66751655d6..0e436bc7ead7e 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -393,9 +393,9 @@ int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
 }
 
 Expected<DeviceImageTy *>
-L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
+L0DeviceTy::loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
                            int32_t ImageId) {
-  auto *PGM = getProgramFromImage(TgtImage);
+  auto *PGM = getProgramFromImage(TgtImage->getMemBufferRef());
   if (PGM) {
     // Program already exists
     return PGM;
@@ -403,14 +403,7 @@ L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
 
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
        "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
-       DPxPTR(TgtImage->ImageStart));
-
-  const size_t NumEntries =
-      (size_t)(TgtImage->EntriesEnd - TgtImage->EntriesBegin);
-
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
-       "Expecting to have %zu entries defined\n", NumEntries);
-  (void)NumEntries; // silence warning
+       DPxPTR(TgtImage->getBufferStart()));
 
   const auto &Options = getPlugin().getOptions();
   std::string CompilationOptions(Options.CompilationOptions);
@@ -421,7 +414,7 @@ L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
 
   CompilationOptions += " ";
   CompilationOptions += Options.InternalCompilationOptions;
-  auto &Program = addProgram(ImageId, TgtImage);
+  auto &Program = addProgram(ImageId, std::move(TgtImage));
 
   int32_t RC = Program.buildModules(CompilationOptions);
   if (RC != OFFLOAD_SUCCESS)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 68ef755b2a852..524e51dd256c6 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -30,10 +30,8 @@ Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                                      GlobalTy &DeviceGlobal) {
   const char *GlobalName = DeviceGlobal.getName().data();
 
-  L0DeviceTy &l0Device = static_cast<L0DeviceTy &>(Device);
-  const L0ProgramTy *Program =
-      l0Device.getProgramFromImage(Image.getTgtImage());
-  void *Addr = Program->getOffloadVarDeviceAddr(GlobalName);
+  L0ProgramTy &Program = L0ProgramTy::makeL0Program(Image);
+  void *Addr = Program.getOffloadVarDeviceAddr(GlobalName);
 
   // Save the pointer to the symbol allowing nullptr.
   DeviceGlobal.setPtr(Addr);
@@ -65,18 +63,9 @@ void L0ProgramTy::setLibModule() {
 #if _WIN32
   return;
 #else
-  const auto *Image = getTgtImage();
-  const size_t NumEntries =
-      static_cast<size_t>(Image->EntriesEnd - Image->EntriesBegin);
-  for (size_t I = 0; I < NumEntries; I++) {
-    const auto &Entry = Image->EntriesBegin[I];
-    // Image contains a kernel, so it is not compiled as a library module
-    if (Entry.SymbolName && Entry.Size == 0)
-      return;
-  }
   // Check if the image belongs to a dynamic library
   Dl_info DLI{nullptr};
-  if (dladdr(Image->ImageStart, &DLI) && DLI.dli_fname) {
+  if (dladdr(getStart(), &DLI) && DLI.dli_fname) {
     std::vector<uint8_t> FileBin;
     auto Size = readFile(DLI.dli_fname, FileBin);
     if (Size) {
@@ -277,32 +266,18 @@ bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
   return Res;
 }
 
-static StringRef getImageStringRef(const __tgt_device_image *Image) {
-  const char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
-  const char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
-  const size_t ImgSize = ImgEnd - ImgBegin;
-  return StringRef(ImgBegin, ImgSize);
-}
-
-bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
-                        uint64_t &MinorVer) {
-  return isValidOneOmpImage(getImageStringRef(Image), MajorVer, MinorVer);
-}
-
 int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
   auto &l0Device = getL0Device();
-  auto *Image = getTgtImage();
-  if (identify_magic(getImageStringRef(Image)) == file_magic::spirv_object) {
+  auto Image = getMemoryBuffer();
+  if (identify_magic(Image.getBuffer()) == file_magic::spirv_object) {
     // Handle legacy plain SPIR-V image.
-    uint8_t *ImgBegin = reinterpret_cast<uint8_t *>(Image->ImageStart);
-    uint8_t *ImgEnd = reinterpret_cast<uint8_t *>(Image->ImageEnd);
-    size_t ImgSize = ImgEnd - ImgBegin;
-    return addModule(ImgSize, ImgBegin, BuildOptions,
+    const uint8_t *ImgBegin = reinterpret_cast<const uint8_t *>(getStart());
+    return addModule(getSize(), ImgBegin, BuildOptions,
                      ZE_MODULE_FORMAT_IL_SPIRV);
   }
 
   uint64_t MajorVer, MinorVer;
-  if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+  if (!isValidOneOmpImage(Image.getBuffer(), MajorVer, MinorVer)) {
     DP("Warning: image is not a valid oneAPI OpenMP image.\n");
     return OFFLOAD_FAIL;
   }
@@ -326,11 +301,8 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
   };
   std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
 
-  auto MB = MemoryBuffer::getMemBuffer(getImageStringRef(Image),
-                                       /*BufferName=*/"",
-                                       /*RequiresNullTerminator=*/false);
   auto ExpectedNewE =
-      ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+      ELFObjectFileBase::createELFObjectFile(Image);
   assert(ExpectedNewE &&
          "isValidOneOmpImage() returns true for invalid ELF image");
   auto processELF = [&](auto *EObj) {

>From e36a8fcb9bb7bc42f9de0dc36e613ae78a00149b Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Fri, 19 Sep 2025 14:36:03 +0200
Subject: [PATCH 15/70] fix image checking

---
 offload/plugins-nextgen/level_zero/src/L0Program.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 524e51dd256c6..e1595792a5957 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -343,7 +343,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX:
           llvm::SmallVector<llvm::StringRef, 4> Parts;
           DescStrRef.split(Parts, '\0', /* MaxSplit = */ 4,
-                           /* KeepEmpty = */ false);
+                           /* KeepEmpty = */ true);
 
           // Ignore records with less than 4 strings.
           if (Parts.size() != 4) {

>From 53bce7eabecef2684612d2984c9b016ced97f180 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Sun, 21 Sep 2025 16:49:31 +0200
Subject: [PATCH 16/70] changes from review comments

---
 offload/CMakeLists.txt                        |  3 +-
 .../Modules/LibomptargetGetDependencies.cmake |  1 -
 .../plugins-nextgen/level_zero/CMakeLists.txt |  1 -
 .../level_zero/include/AsyncQueue.h           |  8 ++---
 .../level_zero/include/L0Context.h            |  4 +--
 .../level_zero/include/L0Device.h             | 19 ++++++-----
 .../level_zero/include/L0Kernel.h             | 32 +++++++++----------
 .../level_zero/include/L0Memory.h             |  4 +--
 .../level_zero/include/L0Program.h            |  5 +--
 .../level_zero/src/L0Device.cpp               | 16 +++++-----
 .../level_zero/src/L0Kernel.cpp               | 22 +++++++------
 .../level_zero/src/L0Memory.cpp               |  4 +--
 .../level_zero/src/L0Program.cpp              | 11 +++----
 13 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 3432ca3c29059..50e13045947eb 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -180,13 +180,12 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
         CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
   if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
     message(STATUS "Not building Level Zero plugin: it is only supported on "
-	           "Linux/Windows x86_64 or ppc64le hosts")
+                   "Linux/Windows x86_64 or ppc64le hosts")
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
   endif()
 endif()
 if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
 		NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-    message(STATUS "Not building Level Zero plugin: dependencies not found")
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
 endif()
 message(STATUS "Building the offload library with support for "
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 0af0ae1ecdbec..240f838dbd934 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -92,7 +92,6 @@ endif()
 ################################################################################
 # Looking for Level0
 ################################################################################
-message(STATUS "Looking for Level0 includes.")
 find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
 
 if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index df38671c040ab..ecc8f11884671 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -25,7 +25,6 @@ target_sources(omptarget.rtl.level_zero PRIVATE
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index dfa8c54b1c124..b07e0a7790f6b 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -13,9 +13,7 @@
 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 
-#include <list>
 #include <tuple>
-#include <vector>
 
 #include "L0Memory.h"
 
@@ -27,11 +25,11 @@ namespace plugin {
 /// Abstract queue that supports asynchronous command submission
 struct AsyncQueueTy {
   /// List of events attahced to submitted commands
-  std::vector<ze_event_handle_t> WaitEvents;
+  llvm::SmallVector<ze_event_handle_t> WaitEvents;
   /// Pending staging buffer to host copies
-  std::list<std::tuple<void *, void *, size_t>> H2MList;
+  llvm::SmallVector<std::tuple<void *, void *, size_t>> H2MList;
   /// Pending USM memory copy commands that must wait for kernel completion
-  std::list<std::tuple<const void *, void *, size_t>> USM2MList;
+  llvm::SmallVector<std::tuple<const void *, void *, size_t>> USM2MList;
   /// Kernel event not signaled
   ze_event_handle_t KernelEvent = nullptr;
   /// Is this queue being used currently
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 69748a3e61d01..29d01bb7b2a2a 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -55,7 +55,7 @@ class L0ContextTy {
 
   /// Imported external pointers. Track this only for user-directed
   /// imports/releases.
-  std::unordered_map<uintptr_t, size_t> ImportedPtrs;
+  llvm::DenseMap<uintptr_t, size_t> ImportedPtrs;
 
   /// Common event pool
   EventPoolTy EventPool;
@@ -92,7 +92,7 @@ class L0ContextTy {
 
   /// Add imported external pointer region.
   void addImported(void *Ptr, size_t Size) {
-    (void)ImportedPtrs.emplace((uintptr_t)Ptr, Size);
+    (void)ImportedPtrs.try_emplace((uintptr_t)Ptr, Size);
   }
 
   /// Remove imported external pointer region
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index df20cfe09c304..61e3e381c1ebb 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -287,7 +287,7 @@ class L0DeviceTy final : public GenericDeviceTy {
 public:
   L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
              ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
-             const std::string &zeId, int32_t ComputeIndex)
+             const std::string_view zeId, int32_t ComputeIndex)
       : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
         l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
         ComputeIndex(ComputeIndex) {
@@ -301,10 +301,9 @@ class L0DeviceTy final : public GenericDeviceTy {
     CacheProperties.pNext = nullptr;
 
     auto Err = internalInit();
-    if (Err) {
+    if (Err)
       FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
                     toString(std::move(Err)).c_str());
-    }
   }
 
   static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
@@ -329,10 +328,10 @@ class L0DeviceTy final : public GenericDeviceTy {
   const L0ContextTy &getL0Context() const { return l0Context; }
   L0ContextTy &getL0Context() { return l0Context; }
 
-  const std::string &getName() const { return DeviceName; }
+  const std::string_view getName() const { return DeviceName; }
   const char *getNameCStr() const { return DeviceName.c_str(); }
 
-  const std::string &getZeId() const { return zeId; }
+  const std::string_view getZeId() const { return zeId; }
   const char *getZeIdCStr() const { return zeId.c_str(); }
 
   std::mutex &getMutex() { return Mutex; }
@@ -429,7 +428,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   bool isDiscreteDevice() { return isDiscrete(getPCIId()); }
   bool isDeviceIPorNewer(uint32_t Version) const;
 
-  const std::string &getUuid() const { return DeviceUuid; }
+  const std::string_view getUuid() const { return DeviceUuid; }
 
   uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
   uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
@@ -458,13 +457,13 @@ class L0DeviceTy final : public GenericDeviceTy {
                                          ze_device_handle_t Device,
                                          uint32_t Ordinal,
                                          ze_command_list_flags_t Flags,
-                                         const std::string &DeviceIdStr);
+                                         const std::string_view DeviceIdStr);
 
   /// Create a command list with default flags
   ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
                                          ze_device_handle_t Device,
                                          uint32_t Ordinal,
-                                         const std::string &DeviceIdStr);
+                                         const std::string_view DeviceIdStr);
 
   ze_command_list_handle_t getCmdList();
 
@@ -473,13 +472,13 @@ class L0DeviceTy final : public GenericDeviceTy {
                                            ze_device_handle_t Device,
                                            uint32_t Ordinal, uint32_t Index,
                                            ze_command_queue_flags_t Flags,
-                                           const std::string &DeviceIdStr);
+                                           const std::string_view DeviceIdStr);
 
   /// Create a command queue with default flags
   ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
                                            ze_device_handle_t Device,
                                            uint32_t Ordinal, uint32_t Index,
-                                           const std::string &DeviceIdStr,
+                                           const std::string_view DeviceIdStr,
                                            bool InOrder = false);
 
   /// Create a new command queue for the given OpenMP device ID
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index eca416d6fa882..abba4b03b01b7 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -59,34 +59,34 @@ struct KernelPropertiesTy {
 
   /// Check if we can reuse group parameters.
   bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
-                        const int32_t _NumTeams, const int32_t _ThreadLimit,
-                        uint32_t *_GroupSizes, ze_group_count_t &_GroupCounts,
-                        bool &_AllowCooperative) const {
+                        const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
+                        uint32_t *GroupSizesOut, ze_group_count_t &GroupCountsOut,
+                        bool &AllowCooperativeOut) const {
     if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
       return false;
     if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc)))
       return false;
-    if (_NumTeams != NumTeams || _ThreadLimit != ThreadLimit)
+    if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
       return false;
     // Found matching input parameters.
-    std::copy_n(GroupSizes, 3, _GroupSizes);
-    _GroupCounts = GroupCounts;
-    _AllowCooperative = AllowCooperative;
+    std::copy_n(GroupSizes, 3, GroupSizesOut);
+    GroupCountsOut = GroupCounts;
+    AllowCooperativeOut = AllowCooperative;
     return true;
   }
 
   /// Update cached group parameters.
   void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
-                        const int32_t _NumTeams, const int32_t _ThreadLimit,
-                        const uint32_t *_GroupSizes,
-                        const ze_group_count_t &_GroupCounts,
-                        const bool &_AllowCooperative) {
+                        const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
+                        const uint32_t *GroupSizesIn,
+                        const ze_group_count_t &GroupCountsIn,
+                        const bool &AllowCooperativeIn) {
     LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
-    NumTeams = _NumTeams;
-    ThreadLimit = _ThreadLimit;
-    std::copy_n(_GroupSizes, 3, GroupSizes);
-    GroupCounts = _GroupCounts;
-    AllowCooperative = _AllowCooperative;
+    NumTeams = NumTeamsIn;
+    ThreadLimit = ThreadLimitIn;
+    std::copy_n(GroupSizesIn, 3, GroupSizes);
+    GroupCounts = GroupCountsIn;
+    AllowCooperative = AllowCooperativeIn;
   }
 };
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 63115b1a3c529..194f6ab6d0f20 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -195,7 +195,7 @@ class MemAllocatorTy {
     /// List of bucket parameters
     std::vector<std::pair<size_t, size_t>> BucketParams;
     /// Map from allocated pointer to corresponding block.
-    std::unordered_map<void *, BlockTy *> PtrToBlock;
+    llvm::DenseMap<void *, BlockTy *> PtrToBlock;
     /// Simple stats counting miss/hit in each bucket.
     std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
     /// Need to zero-initialize after L0 allocation
@@ -300,7 +300,7 @@ class MemAllocatorTy {
   /// Allocation information map
   MemAllocInfoMapTy AllocInfo;
   /// RTL-owned memory that needs to be freed automatically
-  std::list<void *> MemOwned;
+  std::vector<void *> MemOwned;
   /// Lock protection
   std::mutex Mtx;
   /// Allocator only supports host memory
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index 7517b57764b87..ca82a78653f9d 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -57,7 +57,8 @@ class L0ProgramTy : public DeviceImageTy {
 
   /// Build a single module with the given image, build option, and format.
   int32_t addModule(const size_t Size, const uint8_t *Image,
-                    const std::string &BuildOption, ze_module_format_t Format);
+                    const std::string_view BuildOption,
+                    ze_module_format_t Format);
   /// Read file and return the size of the binary if successful.
   size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
   void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
@@ -87,7 +88,7 @@ class L0ProgramTy : public DeviceImageTy {
   }
 
   /// Build modules from the target image description
-  int32_t buildModules(std::string &BuildOptions);
+  int32_t buildModules(const std::string_view BuildOptions);
 
   /// Link modules stored in \p Modules.
   int32_t linkModules();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 0e436bc7ead7e..f299016e3bfa1 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -159,7 +159,7 @@ void L0DeviceTy::reportDeviceInfo() const {
   DP("Device %" PRIu32 "\n", DeviceId);
   DP("-- Name                         : %s\n", getNameCStr());
   DP("-- PCI ID                       : 0x%" PRIx32 "\n", getPCIId());
-  DP("-- UUID                         : %s\n", getUuid().c_str());
+  DP("-- UUID                         : %s\n", getUuid().data());
   DP("-- Number of total EUs          : %" PRIu32 "\n", getNumEUs());
   DP("-- Number of threads per EU     : %" PRIu32 "\n", getNumThreadsPerEU());
   DP("-- EU SIMD width                : %" PRIu32 "\n", getSIMDWidth());
@@ -558,7 +558,7 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
   Info.add("Device Number", getDeviceId());
   Info.add("Device Name", getNameCStr());
   Info.add("Device PCI ID", getPCIId());
-  Info.add("Device UUID", getUuid().c_str());
+  Info.add("Device UUID", getUuid().data());
   Info.add("Number of total EUs", getNumEUs());
   Info.add("Number of threads per EU", getNumThreadsPerEU());
   Info.add("EU SIMD width", getSIMDWidth());
@@ -814,7 +814,7 @@ int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
 /// Create a command list with given ordinal and flags
 ze_command_list_handle_t L0DeviceTy::createCmdList(
     ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
-    ze_command_list_flags_t Flags, const std::string &DeviceIdStr) {
+    ze_command_list_flags_t Flags, const std::string_view DeviceIdStr) {
   ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
                                         nullptr, // extension
                                         Ordinal, Flags};
@@ -823,7 +823,7 @@ ze_command_list_handle_t L0DeviceTy::createCmdList(
                    &cmdList);
   DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
      ") for device %s.\n",
-     DPxPTR(cmdList), Ordinal, DeviceIdStr.c_str());
+     DPxPTR(cmdList), Ordinal, DeviceIdStr.data());
   return cmdList;
 }
 
@@ -831,7 +831,7 @@ ze_command_list_handle_t L0DeviceTy::createCmdList(
 ze_command_list_handle_t
 L0DeviceTy::createCmdList(ze_context_handle_t Context,
                           ze_device_handle_t Device, uint32_t Ordinal,
-                          const std::string &DeviceIdStr) {
+                          const std::string_view DeviceIdStr) {
   return (Ordinal == UINT32_MAX)
              ? nullptr
              : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
@@ -853,7 +853,7 @@ ze_command_queue_handle_t
 L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
                            ze_device_handle_t Device, uint32_t Ordinal,
                            uint32_t Index, ze_command_queue_flags_t Flags,
-                           const std::string &DeviceIdStr) {
+                           const std::string_view DeviceIdStr) {
   ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
                                           nullptr, // extension
                                           Ordinal,
@@ -866,14 +866,14 @@ L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
                    &cmdQueue);
   DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
      ", Flags: %" PRIu32 ") for device %s.\n",
-     DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.c_str());
+     DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.data());
   return cmdQueue;
 }
 
 /// Create a command queue with default flags
 ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
     ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
-    uint32_t Index, const std::string &DeviceIdStr, bool InOrder) {
+    uint32_t Index, const std::string_view DeviceIdStr, bool InOrder) {
   ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
   return (Ordinal == UINT32_MAX) ? nullptr
                                  : createCmdQueue(Context, Device, Ordinal,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 538e627405b6d..24e8e07e460ea 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -58,15 +58,15 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
 /// are subject to change at any program point, so every kernel execution
 /// needs to read the most recent values.
 static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
-  int ThrLimit;
-  ThrLimit = omp_get_teams_thread_limit();
-  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+  int32_t ThreadLimit;
+  ThreadLimit = omp_get_teams_thread_limit();
+  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThreadLimit);
   // omp_get_thread_limit() would return INT_MAX by default.
   // NOTE: Windows.h defines max() macro, so we have to guard
   //       the call with parentheses.
-  int32_t ThreadLimit =
-      (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int32_t>::max)())
-          ? ThrLimit
+  ThreadLimit =
+      (ThreadLimit > 0 && ThreadLimit != (std::numeric_limits<int32_t>::max)())
+          ? ThreadLimit
           : 0;
 
   int NTeams = omp_get_max_teams();
@@ -215,10 +215,14 @@ void L0KernelTy::decideKernelGroupArguments(
 // a loop kernel compiled with the given SIMDWidth, and the given
 // loop(s) trip counts and group sizes.
 // Returns UINT64_MAX, if computations overflow.
-static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
-                                     const uint32_t (&GroupSizes)[3],
+static uint64_t computeThreadsNeeded(const llvm::ArrayRef<size_t> TripCounts,
+                                     const llvm::ArrayRef<uint32_t> GroupSizes,
                                      uint32_t SIMDWidth) {
-  uint64_t GroupCount[3];
+  assert(TripCounts.size() == 3 && "Invalid trip counts array size");
+  assert(GroupSizes.size() == 3 && "Invalid group sizes array size");
+  // Compute the number of groups in each dimension.
+  std::array<uint64_t, 3> GroupCount;
+
   for (int I = 0; I < 3; ++I) {
     if (TripCounts[I] == 0 || GroupSizes[I] == 0)
       return (std::numeric_limits<uint64_t>::max)();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 790acdd9f568f..fe80783c699d2 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -249,7 +249,7 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
       continue;
     Mem = Block->alloc();
     assert(Mem && "Inconsistent state while allocating memory from pool");
-    PtrToBlock.emplace(Mem, Block);
+    PtrToBlock.try_emplace(Mem, Block);
     break;
   }
 
@@ -276,7 +276,7 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
     BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
     Blocks.push_back(Block);
     Mem = Block->alloc();
-    PtrToBlock.emplace(Mem, Block);
+    PtrToBlock.try_emplace(Mem, Block);
     if (IsFull)
       SmallPoolSize += BlockSize;
     else
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index e1595792a5957..3be9c6d410b43 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -64,7 +64,7 @@ void L0ProgramTy::setLibModule() {
   return;
 #else
   // Check if the image belongs to a dynamic library
-  Dl_info DLI{nullptr};
+  Dl_info DLI{nullptr, nullptr, nullptr, nullptr};
   if (dladdr(getStart(), &DLI) && DLI.dli_fname) {
     std::vector<uint8_t> FileBin;
     auto Size = readFile(DLI.dli_fname, FileBin);
@@ -88,7 +88,7 @@ void L0ProgramTy::setLibModule() {
 }
 
 int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
-                               const std::string &CommonBuildOptions,
+                               const std::string_view CommonBuildOptions,
                                ze_module_format_t Format) {
   const ze_module_constants_t SpecConstants =
       LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
@@ -266,7 +266,7 @@ bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
   return Res;
 }
 
-int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
+int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
   auto &l0Device = getL0Device();
   auto Image = getMemoryBuffer();
   if (identify_magic(Image.getBuffer()) == file_magic::spirv_object) {
@@ -456,7 +456,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
     const bool IsBinary = (It->second.Format == 0);
     const auto ModuleFormat =
         IsBinary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV;
-    std::string Options = BuildOptions;
+    std::string Options(BuildOptions);
     {
       Options += " " + It->second.CompileOpts + " " + It->second.LinkOpts;
       replaceDriverOptsWithBackendOpts(l0Device, Options);
@@ -476,9 +476,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         return OFFLOAD_FAIL;
       }
     }
-
     DP("Created module from image #%" PRIu64 ".\n", Idx);
-    BuildOptions = std::move(Options);
 
     return OFFLOAD_SUCCESS;
   }
@@ -538,7 +536,6 @@ int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
 int32_t L0ProgramTy::loadModuleKernels() {
   // We need to build kernels here before filling the offload entries since we
   // don't know which module contains a specific kernel with a name.
-  std::unordered_map<std::string, ze_kernel_handle_t> ModuleKernels;
   for (auto Module : Modules) {
     uint32_t Count = 0;
     CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);

>From 69df38851b02bb7c2e8bed68af69701be9fea5cf Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Sun, 21 Sep 2025 19:25:20 +0200
Subject: [PATCH 17/70] remove more CMake messages

---
 offload/cmake/Modules/LibomptargetGetDependencies.cmake | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 240f838dbd934..3dda54e15e50f 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -96,17 +96,9 @@ find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
 
 if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
 	set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
-  message(STATUS "Could NOT find Level Zero. Missing includes.")
 else()
-  message(STATUS "Level Zero include DIR: ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}")
   set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
-  message(STATUS "Looking for Level Zero library.")
   find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
-  if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES)
-    message(STATUS "Could NOT find Level Zero. Missing library.")
-  else()
-	  message(STATUS "Level Zero library: ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES}")
-  endif()
 endif()
 
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})

>From 91fa069d973b1b0d6a9ee29950336e38adb4839a Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Sun, 21 Sep 2025 19:28:00 +0200
Subject: [PATCH 18/70] format

---
 offload/plugins-nextgen/level_zero/include/L0Device.h  | 5 +++--
 offload/plugins-nextgen/level_zero/include/L0Kernel.h  | 3 ++-
 offload/plugins-nextgen/level_zero/include/L0Program.h | 2 +-
 offload/plugins-nextgen/level_zero/src/L0Program.cpp   | 3 +--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 61e3e381c1ebb..50d8b196adfa5 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -574,8 +574,9 @@ class L0DeviceTy final : public GenericDeviceTy {
   int32_t makeMemoryResident(void *Mem, size_t Size);
 
   // Generic device interface implementation
-  Expected<DeviceImageTy *> loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
-                                           int32_t ImageId) override;
+  Expected<DeviceImageTy *>
+  loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
+                 int32_t ImageId) override;
   Error unloadBinaryImpl(DeviceImageTy *Image) override;
   void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
   int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index abba4b03b01b7..507f915089427 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -60,7 +60,8 @@ struct KernelPropertiesTy {
   /// Check if we can reuse group parameters.
   bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
                         const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
-                        uint32_t *GroupSizesOut, ze_group_count_t &GroupCountsOut,
+                        uint32_t *GroupSizesOut,
+                        ze_group_count_t &GroupCountsOut,
                         bool &AllowCooperativeOut) const {
     if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
       return false;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index ca82a78653f9d..520bfa688a5af 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -73,7 +73,7 @@ class L0ProgramTy : public DeviceImageTy {
   L0ProgramTy() = delete;
 
   L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
-               std::unique_ptr<MemoryBuffer> Image)
+              std::unique_ptr<MemoryBuffer> Image)
       : DeviceImageTy(ImageId, Device, std::move(Image)) {}
 
   ~L0ProgramTy();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 3be9c6d410b43..545ad3f5bf80c 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -301,8 +301,7 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
   };
   std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
 
-  auto ExpectedNewE =
-      ELFObjectFileBase::createELFObjectFile(Image);
+  auto ExpectedNewE = ELFObjectFileBase::createELFObjectFile(Image);
   assert(ExpectedNewE &&
          "isValidOneOmpImage() returns true for invalid ELF image");
   auto processELF = [&](auto *EObj) {

>From 561e4ca05bb59cce42b61b75f77a23c4e4bd83d6 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Sun, 21 Sep 2025 19:43:46 +0200
Subject: [PATCH 19/70] remove unnecessary check

---
 offload/plugins-nextgen/level_zero/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index ecc8f11884671..04a0354cfd27a 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -1,7 +1,3 @@
-if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-  return()
-endif()
-
 # Create the library and add the default arguments.
 add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
 

>From 5b90ccb7bafc63e61f40d36a07c1f56a28dfaa3e Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 16:27:08 +0200
Subject: [PATCH 20/70] Add level_zero to liboffload platforms

---
 offload/liboffload/API/Platform.td     | 1 +
 offload/liboffload/src/OffloadImpl.cpp | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td
index 906f899076a80..9e297efc1db6e 100644
--- a/offload/liboffload/API/Platform.td
+++ b/offload/liboffload/API/Platform.td
@@ -27,6 +27,7 @@ def ol_platform_backend_t : Enum {
     Etor<"UNKNOWN", "The backend is not recognized">,
     Etor<"CUDA", "The backend is CUDA">,
     Etor<"AMDGPU", "The backend is AMDGPU">,
+    Etor<"LEVEL_ZERO", "The backend is Level Zero">,
     Etor<"HOST", "The backend is the host">,
   ];
 }
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index c5d083db7522e..493d539aee315 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -231,6 +231,8 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) {
     return OL_PLATFORM_BACKEND_AMDGPU;
   } else if (Name == "cuda") {
     return OL_PLATFORM_BACKEND_CUDA;
+  } else if (Name == "level_zero") {
+    return OL_PLATFORM_BACKEND_LEVEL_ZERO;
   } else {
     return OL_PLATFORM_BACKEND_UNKNOWN;
   }

>From 6b8280dada27c0a6295b5a7ea9e3d7e249c5bbd0 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 16:49:58 +0200
Subject: [PATCH 21/70] Fix CMake indentation, spec constant allocation, and device info reporting

---
 offload/CMakeLists.txt                        |  4 ++--
 .../Modules/LibomptargetGetDependencies.cmake |  2 +-
 .../level_zero/include/L0Device.h             |  1 +
 .../level_zero/include/L0Options.h            | 21 +++++++++---------
 .../level_zero/src/L0Device.cpp               | 22 ++++++++++++++++---
 5 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 50e13045947eb..4a2890e5ca741 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -185,8 +185,8 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
   endif()
 endif()
 if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
-		NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-    list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+   NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+  list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
 endif()
 message(STATUS "Building the offload library with support for "
                "the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 3dda54e15e50f..44b50c40ba7c0 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -95,7 +95,7 @@ endif()
 find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
 
 if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
-	set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
+  set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
 else()
   set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
   find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 50d8b196adfa5..1282a1e418183 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -403,6 +403,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
   auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
   auto getCacheSize() const { return CacheProperties.cacheSize; }
+  auto getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; }
 
   int32_t getAllocKind() const { return AllocKind; }
   DeviceArchTy getDeviceArch() const { return DeviceArch; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 8c79a82ef724b..2ee0a2e264a24 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -27,28 +27,26 @@ enum class CommandModeTy { Sync = 0, Async, AsyncOrdered };
 class SpecConstantsTy {
   std::vector<uint32_t> ConstantIds;
   std::vector<const void *> ConstantValues;
+  BumpPtrAllocator &Allocator;
 
 public:
-  SpecConstantsTy() = default;
+  SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
   SpecConstantsTy(const SpecConstantsTy &) = delete;
   SpecConstantsTy(SpecConstantsTy &&) = delete;
   SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
   SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete;
   SpecConstantsTy(const SpecConstantsTy &&Other)
       : ConstantIds(std::move(Other.ConstantIds)),
-        ConstantValues(std::move(Other.ConstantValues)) {}
+        ConstantValues(std::move(Other.ConstantValues)),
+        Allocator(Other.Allocator) {}
 
   ~SpecConstantsTy() {
-    for (auto I : ConstantValues) {
-      const char *ValuePtr = reinterpret_cast<const char *>(I);
-      delete[] ValuePtr;
-    }
   }
 
   template <typename T> void addConstant(uint32_t Id, T Val) {
-    const size_t ValSize = sizeof(Val);
-    char *ValuePtr = new char[ValSize];
-    *reinterpret_cast<T *>(ValuePtr) = Val;
+    T *ValuePtr =
+        reinterpret_cast<T *>(Allocator.Allocate(sizeof(T), alignof(T)));
+    *ValuePtr = Val;
 
     ConstantIds.push_back(Id);
     ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
@@ -134,7 +132,10 @@ struct L0OptionsTy {
 
   bool Init = false; // have the options already been processed
 
-  L0OptionsTy() {}
+  // Allocator for long-lived allocations (e.g. spec constants)
+  BumpPtrAllocator Allocator;
+
+  L0OptionsTy() : CommonSpecConstants(Allocator) {}
 
   /// Read environment variables
   void processEnvironmentVars();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index f299016e3bfa1..357c15cd349ab 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -553,10 +553,25 @@ Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
   return Plugin::success();
 }
 
+static const char *DriverVersionToStrTable[] = {
+    "1.0", "1.1", "1.2", "1.3",  "1.4",  "1.5", "1.6",
+    "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"};
+constexpr size_t DriverVersionToStrTableSize =
+    sizeof(DriverVersionToStrTable) / sizeof(DriverVersionToStrTable[0]);
+
 Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
   InfoTreeNode Info;
   Info.add("Device Number", getDeviceId());
-  Info.add("Device Name", getNameCStr());
+  Info.add("Device Name", getNameCStr(), "", DeviceInfo::NAME);
+  Info.add("Device Type", "GPU", "", DeviceInfo::TYPE);
+  Info.add("Vendor", "Intel", "", DeviceInfo::VENDOR);
+  Info.add("Vendor ID", getVendorId(), "", DeviceInfo::VENDOR_ID);
+  auto DriverVersion = getDriverAPIVersion();
+  if (DriverVersion < DriverVersionToStrTableSize)
+    Info.add("Driver Version", DriverVersionToStrTable[DriverVersion], "",
+             DeviceInfo::DRIVER_VERSION);
+  else
+    Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION);
   Info.add("Device PCI ID", getPCIId());
   Info.add("Device UUID", getUuid().data());
   Info.add("Number of total EUs", getNumEUs());
@@ -566,9 +581,10 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
   Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
   Info.add("Number of slices", getNumSlices());
   Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
-  Info.add("Global memory size (bytes)", getGlobalMemorySize());
+  Info.add("Global memory size (bytes)", getGlobalMemorySize(), "", DeviceInfo::GLOBAL_MEM_SIZE);
   Info.add("Cache size (bytes)", getCacheSize());
-  Info.add("Max clock frequency (MHz)", getClockRate());
+  Info.add("Max Memory Allocation Size (bytes)", getMaxMemAllocSize(), "", DeviceInfo::MAX_MEM_ALLOC_SIZE);
+  Info.add("Max clock frequency (MHz)", getClockRate(), "" , DeviceInfo::MAX_CLOCK_FREQUENCY);
   return Info;
 }
 

>From f983135181503fd7b9bf1867e765f37db418eaac Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 16:52:49 +0200
Subject: [PATCH 22/70] Fix ELF bits from #159623

---
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index d632d57ce3d5d..a29b15eb54d23 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -177,8 +177,7 @@ GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
 }
 
 uint16_t LevelZeroPluginTy::getMagicElfBits() const {
-  // TODO: We need to register a real ELF machine type
-  return 0x8086;
+  return ELF::EM_INTELGT;
 }
 
 Triple::ArchType LevelZeroPluginTy::getTripleArch() const {

>From b6c393a84440861c52ded2b6a4d4d5f648c91acd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 17:51:13 +0200
Subject: [PATCH 23/70] format & add more DeviceInfo tags

---
 .../plugins-nextgen/level_zero/include/L0Options.h  |  4 +---
 offload/plugins-nextgen/level_zero/src/L0Device.cpp | 13 +++++++++----
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp |  4 +---
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 2ee0a2e264a24..b2a04ae5db293 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -39,9 +39,7 @@ class SpecConstantsTy {
       : ConstantIds(std::move(Other.ConstantIds)),
         ConstantValues(std::move(Other.ConstantValues)),
         Allocator(Other.Allocator) {}
-
-  ~SpecConstantsTy() {
-  }
+  ~SpecConstantsTy() {}
 
   template <typename T> void addConstant(uint32_t Id, T Val) {
     T *ValuePtr =
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 357c15cd349ab..5fc83b7c782fb 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -574,17 +574,22 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
     Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION);
   Info.add("Device PCI ID", getPCIId());
   Info.add("Device UUID", getUuid().data());
-  Info.add("Number of total EUs", getNumEUs());
+  Info.add("Number of total EUs", getNumEUs(), "", DeviceInfo::MAX_COMPUTE_UNITS);
   Info.add("Number of threads per EU", getNumThreadsPerEU());
   Info.add("EU SIMD width", getSIMDWidth());
   Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
   Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
   Info.add("Number of slices", getNumSlices());
+  Info.add("Max Group size", getMaxGroupSize(), "",
+           DeviceInfo::MAX_GROUP_WORK_SIZE);
   Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
-  Info.add("Global memory size (bytes)", getGlobalMemorySize(), "", DeviceInfo::GLOBAL_MEM_SIZE);
+  Info.add("Global memory size (bytes)", getGlobalMemorySize(), "",
+           DeviceInfo::GLOBAL_MEM_SIZE);
   Info.add("Cache size (bytes)", getCacheSize());
-  Info.add("Max Memory Allocation Size (bytes)", getMaxMemAllocSize(), "", DeviceInfo::MAX_MEM_ALLOC_SIZE);
-  Info.add("Max clock frequency (MHz)", getClockRate(), "" , DeviceInfo::MAX_CLOCK_FREQUENCY);
+  Info.add("Max Memory Allocation Size (bytes)", getMaxMemAllocSize(), "",
+           DeviceInfo::MAX_MEM_ALLOC_SIZE);
+  Info.add("Max clock frequency (MHz)", getClockRate(), "",
+           DeviceInfo::MAX_CLOCK_FREQUENCY);
   return Info;
 }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index a29b15eb54d23..176e8d456a759 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -176,9 +176,7 @@ GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
   return new L0GlobalHandlerTy();
 }
 
-uint16_t LevelZeroPluginTy::getMagicElfBits() const {
-  return ELF::EM_INTELGT;
-}
+uint16_t LevelZeroPluginTy::getMagicElfBits() const { return ELF::EM_INTELGT; }
 
 Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
   return Triple::spirv64;

>From b226c70b47bc39fdd033062328d1feb8ce85b0fd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 18:08:09 +0200
Subject: [PATCH 24/70] tidy up Makefile

---
 .../plugins-nextgen/level_zero/CMakeLists.txt | 36 +++++++------------
 .../level_zero/src/L0Device.cpp               |  4 +--
 2 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 04a0354cfd27a..782e53c606536 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -32,37 +32,25 @@ target_include_directories(omptarget.rtl.level_zero PRIVATE
 
 if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
   message(STATUS "Building Level Zero NG plugin linked against level_zero library")
-
-  if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ if(UNIX)
     target_link_libraries(omptarget.rtl.level_zero PRIVATE
-                        ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
-  elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
-    # Full path to the L0 library is recognized as a linker option, so we
+        ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY})
+  elseif(WIN32)
+    # Full path to the Level Zero library is recognized as a linker option, so we
     # separate directory and file name
-    get_filename_component(LEVEL_ZERO_LIBRARY_PATH
-                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
-    get_filename_component(LEVEL_ZERO_LIBRARY_NAME
-                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-    target_link_libraries(omptarget.rtl.level_zero PRIVATE
-                          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
-    target_link_directories(omptarget.rtl.level_zero PRIVATE 
-                            ${LEVEL_ZERO_LIBRARY_PATH})
-    target_link_options(omptarget.rtl.level_zero PRIVATE 
-                        "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
-    libomptarget_add_resource_file(omptarget.rtl.level_zero)
+    cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY PARENT_PATH LEVEL_ZERO_LIBRARY_PATH)
+    target_link_libraries(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_NAME}
+        ${LIBOMP_LIB_FILE})
+    target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
   else()
     message(FATAL_ERROR "Missing platform support")
   endif()
 else()
   message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
-  get_filename_component(LEVEL_ZERO_LIBRARY_NAME 
-                         ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-  if(CMAKE_SYSTEM_NAME MATCHES "Windows")
-    # Windows uses dll instead of lib files at runtime
-    string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME 
-           ${LEVEL_ZERO_LIBRARY_NAME})
+  if(WIN32)
+    cmake_path(REPLACE_EXTENSION LEVEL_ZERO_LIBRARY_NAME dll)
   endif()
-  target_compile_options(omptarget.rtl.level_zero PRIVATE 
-                         "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+  target_compile_definitions(omptarget.rtl.level_zero PRIVATE
+      LEVEL_ZERO_LIBRARY="${LEVEL_ZERO_LIBRARY_NAME}")
   target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
 endif()
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 5fc83b7c782fb..b70b3e6293b8f 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -574,14 +574,14 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
     Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION);
   Info.add("Device PCI ID", getPCIId());
   Info.add("Device UUID", getUuid().data());
-  Info.add("Number of total EUs", getNumEUs(), "", DeviceInfo::MAX_COMPUTE_UNITS);
+  Info.add("Number of total EUs", getNumEUs(), "", DeviceInfo::NUM_COMPUTE_UNITS);
   Info.add("Number of threads per EU", getNumThreadsPerEU());
   Info.add("EU SIMD width", getSIMDWidth());
   Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
   Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
   Info.add("Number of slices", getNumSlices());
   Info.add("Max Group size", getMaxGroupSize(), "",
-           DeviceInfo::MAX_GROUP_WORK_SIZE);
+           DeviceInfo::MAX_WORK_GROUP_SIZE);
   Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
   Info.add("Global memory size (bytes)", getGlobalMemorySize(), "",
            DeviceInfo::GLOBAL_MEM_SIZE);

>From af1945fcd900722ac32824b3bcd061e4f1b214fc Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 23 Sep 2025 18:29:14 +0200
Subject: [PATCH 25/70] format & renaming

---
 offload/cmake/Modules/LibomptargetGetDependencies.cmake | 6 +++---
 offload/plugins-nextgen/level_zero/CMakeLists.txt       | 5 +++--
 offload/plugins-nextgen/level_zero/src/L0Device.cpp     | 3 ++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 44b50c40ba7c0..dc5ea50c958a0 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -92,13 +92,13 @@ endif()
 ################################################################################
 # Looking for Level0
 ################################################################################
-find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
+find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR NAMES level_zero/ze_api.h)
 
-if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR)
   set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
 else()
   set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
-  find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
+  find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY NAMES ze_loader)
 endif()
 
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 782e53c606536..44156cc5a9354 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -25,12 +25,13 @@ target_include_directories(omptarget.rtl.level_zero PRIVATE
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
   ${LIBOMPTARGET_INCLUDE_DIR}
-  ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+  ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR}
   ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
   ${LIBOMPTARGET_OMP_HEADER_DIR}
 )
 
-if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
+cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY FILENAME LEVEL_ZERO_LIBRARY_NAME)
+if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
   message(STATUS "Building Level Zero NG plugin linked against level_zero library")
  if(UNIX)
     target_link_libraries(omptarget.rtl.level_zero PRIVATE
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index b70b3e6293b8f..5550128c95e4c 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -574,7 +574,8 @@ Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
     Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION);
   Info.add("Device PCI ID", getPCIId());
   Info.add("Device UUID", getUuid().data());
-  Info.add("Number of total EUs", getNumEUs(), "", DeviceInfo::NUM_COMPUTE_UNITS);
+  Info.add("Number of total EUs", getNumEUs(), "",
+           DeviceInfo::NUM_COMPUTE_UNITS);
   Info.add("Number of threads per EU", getNumThreadsPerEU());
   Info.add("EU SIMD width", getSIMDWidth());
   Info.add("Number of EUs per subslice", getNumEUsPerSubslice());

>From 3dba3c3f28abee9da36d8d42c4d01e6527bfd502 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 24 Sep 2025 11:21:03 +0200
Subject: [PATCH 26/70] Remove ompt_device_t reference

---
 offload/plugins-nextgen/level_zero/include/L0Device.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 1282a1e418183..23e8717624024 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -309,9 +309,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
     return static_cast<L0DeviceTy &>(Device);
   }
-  static L0DeviceTy &makeL0Device(ompt_device_t *Device) {
-    return *static_cast<L0DeviceTy *>(Device);
-  }
 
   auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
   L0DeviceTLSTy &getTLS();

>From ac6223190eb36b58040e338d9443e0a6f9506f37 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 25 Sep 2025 14:16:44 +0200
Subject: [PATCH 27/70] address reviews

---
 .../level_zero/include/L0Memory.h                | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 194f6ab6d0f20..df29b01d46012 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -464,9 +464,9 @@ class EventPoolTy {
 
 public:
   /// Initialize context, flags, and mutex
-  void init(ze_context_handle_t _Context, uint32_t _Flags) {
-    Context = _Context;
-    Flags = _Flags;
+  void init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
+    Context = ContextIn;
+    Flags = FlagsIn;
     Mtx.reset(new std::mutex);
   }
 
@@ -496,7 +496,7 @@ class StagingBufferTy {
   /// Number of buffers allocated together
   size_t Count = L0StagingBufferCount;
   /// Buffers increasing by Count if a new buffer is required
-  std::list<void *> Buffers;
+  llvm::SmallVector<void *> Buffers;
   /// Next buffer location in the buffers
   size_t Offset = 0;
 
@@ -533,10 +533,10 @@ class StagingBufferTy {
 
   bool initialized() const { return Context != nullptr; }
 
-  void init(ze_context_handle_t _Context, size_t _Size, size_t _Count) {
-    Context = _Context;
-    Size = _Size;
-    Count = _Count;
+  void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
+    Context = ContextIn;
+    Size = SizeIn;
+    Count = CountIn;
   }
 
   void reset() { Offset = 0; }

>From 17c8e310f57ea9b3124fef55144641117eb618f3 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 25 Sep 2025 15:04:52 +0200
Subject: [PATCH 28/70] remove unused argument

---
 .../plugins-nextgen/level_zero/include/L0Device.h  |  2 +-
 .../plugins-nextgen/level_zero/src/L0Device.cpp    | 14 ++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 23e8717624024..163eb459427af 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -504,7 +504,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   /// Enqueue copy command
   int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
                          __tgt_async_info *AsyncInfo = nullptr,
-                         bool Locked = false, bool UseCopyEngine = true);
+                         bool UseCopyEngine = true);
 
   /// Enqueue asynchronous copy command
   int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 5550128c95e4c..3a165b042d64c 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -529,8 +529,7 @@ Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
       return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
   } else {
     if (enqueueMemCopy(DstPtr, SrcPtr, Size,
-                       /* AsyncInfo */ nullptr,
-                       /* Locked */ false, UseCopyEngine))
+                       /* AsyncInfo */ nullptr, UseCopyEngine))
       return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
   }
   return Plugin::success();
@@ -693,7 +692,7 @@ Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
 }
 
 int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
-                                   __tgt_async_info *AsyncInfo, bool Locked,
+                                   __tgt_async_info *AsyncInfo,
                                    bool UseCopyEngine) {
   ze_command_list_handle_t CmdList = nullptr;
   ze_command_queue_handle_t CmdQueue = nullptr;
@@ -717,13 +716,8 @@ int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
     CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
                      Event, 0, nullptr);
     CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
-    if (Locked) {
-      CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
-                       nullptr);
-    } else {
-      CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
-                           CmdQueue, 1, &CmdList, nullptr);
-    }
+    CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+                         CmdQueue, 1, &CmdList, nullptr);
     CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
     CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
   }

>From b8591792fa44cf97bd475a0ac38deb81a168cb53 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Fri, 26 Sep 2025 10:47:28 +0200
Subject: [PATCH 29/70] cleanup

---
 .../level_zero/include/L0Kernel.h              |  2 +-
 .../plugins-nextgen/level_zero/include/TLS.h   |  7 -------
 .../level_zero/src/L0Device.cpp                |  2 +-
 .../level_zero/src/L0Kernel.cpp                | 18 +++++++++---------
 4 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 507f915089427..c5a3528dd2974 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -147,7 +147,7 @@ class L0KernelTy : public GenericKernelTy {
     return zeKernel;
   }
 
-  int32_t getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+  int32_t getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                          int32_t ThreadLimit, uint32_t *GroupSizes,
                          ze_group_count_t &GroupCounts, void *LoopDesc,
                          bool &AllowCooperative) const;
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
index 46086ee4b6d19..257ada0b33b37 100644
--- a/offload/plugins-nextgen/level_zero/include/TLS.h
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -25,9 +25,6 @@ namespace plugin {
 
 /// All thread-local data used by the Plugin
 class L0ThreadTLSTy {
-  /// Subdevice encoding
-  int64_t SubDeviceCode = 0;
-
   /// Async info tracking
   static constexpr int32_t PerThreadQueues = 10;
   AsyncQueueTy AsyncQueues[PerThreadQueues];
@@ -43,10 +40,6 @@ class L0ThreadTLSTy {
 
   void clear() {}
 
-  int64_t getSubDeviceCode() { return SubDeviceCode; }
-
-  void setSubDeviceCode(int64_t Code) { SubDeviceCode = Code; }
-
   AsyncQueueTy *getAsyncQueue() {
     AsyncQueueTy *ret = nullptr;
     if (UsedQueues < PerThreadQueues) {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 3a165b042d64c..e28734a32d1f5 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -445,7 +445,7 @@ Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
                          " not implemented yet\n",
                          __func__);
   }
-  int32_t RC = synchronize(&AsyncInfo);
+  int32_t RC = synchronize(&AsyncInfo, ReleaseQueue);
   return Plugin::check(RC, "Error in synchronizeImpl %d", RC);
 }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 24e8e07e460ea..17e349d28617e 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -397,13 +397,13 @@ int32_t L0KernelTy::decideLoopKernelGroupArguments(
   return OFFLOAD_SUCCESS;
 }
 
-int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                                    int32_t ThreadLimit, uint32_t *GroupSizes,
                                    ze_group_count_t &GroupCounts,
                                    void *LoopDesc,
                                    bool &AllowCooperative) const {
 
-  const auto SubId = SubDevice.getDeviceId();
+  const auto DeviceId = Device.getDeviceId();
   const auto &KernelPR = getProperties();
 
   // Detect if we need to reduce available HW threads. We need this adjustment
@@ -419,13 +419,13 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
   // Read the most recent global thread limit and max teams.
   auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
 
-  bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+  bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
   bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
   uint32_t KernelWidth = KernelPR.Width;
   uint32_t SIMDWidth = KernelPR.SIMDWidth;
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
   assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
 
@@ -439,10 +439,10 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
     DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
   }
 
-  size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+  size_t MaxThreadLimit = Device.getMaxGroupSize();
   // Set correct max group size if the kernel was compiled with explicit SIMD
   if (SIMDWidth == 1) {
-    MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+    MaxThreadLimit = Device.getNumThreadsPerSubslice();
   }
 
   if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
@@ -463,7 +463,7 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
          NumTeams);
     } else if (NumTeamsICV > 0) {
       // OMP_NUM_TEAMS only matters, if num_teams() clause is absent.
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
            "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);
 
       NumTeams = NumTeamsICV;
@@ -473,7 +473,7 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
 
     bool UseLoopTC = LoopDesc;
     decideKernelGroupArguments(
-        SubDevice, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
+        Device, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
         UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
         GroupCounts, HalfNumThreads, false);
     AllowCooperative = false;

>From f1d1aad06732be0fa634459c14b035441ade8aa9 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Fri, 26 Sep 2025 17:16:57 +0200
Subject: [PATCH 30/70] [OFFLOAD] Enable level zero testing

---
 offload/CMakeLists.txt                        |  1 +
 .../Modules/LibomptargetGetDependencies.cmake | 21 +++++++++++++++++++
 .../plugins-nextgen/level_zero/CMakeLists.txt | 12 +++++++++++
 3 files changed, 34 insertions(+)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 4a2890e5ca741..03567b41d41c4 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -226,6 +226,7 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} riscv64-unknown-linux
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} riscv64-unknown-linux-gnu-LTO")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} loongarch64-unknown-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} loongarch64-unknown-linux-gnu-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} spirv64")
 
 # Once the plugins for the different targets are validated, they will be added to
 # the list of supported targets in the current system.
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index dc5ea50c958a0..9afafc439b728 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -42,6 +42,16 @@ endif()
 find_package(FFI QUIET)
 set(LIBOMPTARGET_DEP_LIBFFI_FOUND ${FFI_FOUND})
 
+################################################################################
+# Looking for offload-arch...
+################################################################################
+if(TARGET offload-arch)
+  get_property(LIBOMPTARGET_OFFLOAD_ARCH TARGET offload-arch PROPERTY LOCATION)
+else()
+  find_program(LIBOMPTARGET_OFFLOAD_ARCH NAMES offload-arch
+               PATHS ${LLVM_TOOLS_BINARY_DIR})
+endif()
+
 ################################################################################
 # Looking for NVIDIA GPUs...
 ################################################################################
@@ -101,4 +111,15 @@ else()
   find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY NAMES ze_loader)
 endif()
 
+if(LIBOMPTARGET_OFFLOAD_ARCH)
+  execute_process(COMMAND ${LIBOMPTARGET_OFFLOAD_ARCH} "--only=intel"
+                  OUTPUT_VARIABLE LIBOMPTARGET_INTELGPU_ARCH_OUTPUT
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  string(REPLACE "\n" ";" intelgpu_arch_list "${LIBOMPTARGET_INTELGPU_ARCH_OUTPUT}")
+  if(intelgpu_arch_list)
+    set(LIBOMPTARGET_FOUND_INTELGPU_GPU TRUE)
+    set(LIBOMPTARGET_INTELGPU_DETECTED_ARCH_LIST "${intelgpu_arch_list}")
+  endif()
+endif()
+
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 44156cc5a9354..d42a2bcc83afa 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -55,3 +55,15 @@ else()
       LEVEL_ZERO_LIBRARY="${LEVEL_ZERO_LIBRARY_NAME}")
   target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
 endif()
+
+option(LIBOMPTARGET_FORCE_LEVELZERO_TESTS "Build Level Zero libomptarget tests" OFF)
+if (LIBOMPTARGET_FOUND_INTELGPU_GPU OR LIBOMPTARGET_FORCE_LEVELZERO_TESTS)
+  # Report to the parent scope that we are building a plugin for intelgpu
+  set(LIBOMPTARGET_SYSTEM_TARGETS
+      "${LIBOMPTARGET_SYSTEM_TARGETS} spirv64" PARENT_SCOPE)
+  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.level_zero")
+  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+else()
+  message(STATUS "Not generating Level Zero tests, no supported devices detected."
+                   " Use 'LIBOMPTARGET_FORCE_LEVELZERO_TESTS' to override.")
+endif()

>From c141f9d005288cdcc913f9302a18b4b5c82a6eaf Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 12:56:39 +0200
Subject: [PATCH 31/70] address review

---
 .../level_zero/include/AsyncQueue.h               |  2 +-
 .../plugins-nextgen/level_zero/include/L0Defs.h   |  4 +---
 .../level_zero/include/L0Options.h                |  4 ++++
 .../plugins-nextgen/level_zero/src/L0Kernel.cpp   | 15 +++------------
 .../plugins-nextgen/level_zero/src/L0Options.cpp  |  3 +++
 5 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index b07e0a7790f6b..a087a082639e4 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -24,7 +24,7 @@ namespace plugin {
 
 /// Abstract queue that supports asynchronous command submission
 struct AsyncQueueTy {
-  /// List of events attahced to submitted commands
+  /// List of events attached to submitted commands
   llvm::SmallVector<ze_event_handle_t> WaitEvents;
   /// Pending staging buffer to host copies
   llvm::SmallVector<std::tuple<void *, void *, size_t>> H2MList;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 66d38cd7b9eb5..b0e56227fa809 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -30,9 +30,7 @@ enum class AllocOptionTy : int32_t {
 /// Host runtime routines being used
 extern "C" {
 LIBOMP_DECL(int, omp_get_max_teams(void));
-LIBOMP_DECL(int, omp_get_thread_limit(void));
 LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
-LIBOMP_DECL(double, omp_get_wtime(void));
 } // extern "C"
 
 #ifndef EXTRACT_BITS
@@ -69,7 +67,7 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
 }
 
 #define L0_UNIMPLEMENTED_ERR                                                   \
-  return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet\n",   \
+  return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",     \
                        __func__);
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b2a04ae5db293..c8531fc50d6ff 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -128,6 +128,10 @@ struct L0OptionsTy {
   /// devices and whether immediate command list is fully enabled.
   CommandModeTy CommandMode = CommandModeTy::Async;
 
+  // Controls if we need to reduce available HW threads. We need this adjustment
+  // on XeHPG when Level Zero debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  bool ZeDebugEnabled = false;
+
   bool Init = false; // have the options already been processed
 
   // Allocator for long-lived allocations (e.g. spec constants)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 17e349d28617e..6f76776296297 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -61,7 +61,7 @@ static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
   int32_t ThreadLimit;
   ThreadLimit = omp_get_teams_thread_limit();
   DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThreadLimit);
-  // omp_get_thread_limit() would return INT_MAX by default.
+  // omp_get_teams_thread_limit() would return INT_MAX by default.
   // NOTE: Windows.h defines max() macro, so we have to guard
   //       the call with parentheses.
   ThreadLimit =
@@ -406,21 +406,12 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
   const auto DeviceId = Device.getDeviceId();
   const auto &KernelPR = getProperties();
 
-  // Detect if we need to reduce available HW threads. We need this adjustment
-  // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
-  static std::once_flag OnceFlag;
-  static bool ZeDebugEnabled = false;
-  std::call_once(OnceFlag, []() {
-    const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
-    if (EnvVal && std::atoi(EnvVal) == 1)
-      ZeDebugEnabled = true;
-  });
-
   // Read the most recent global thread limit and max teams.
   auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
 
   bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
-  bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+  bool HalfNumThreads =
+      LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
   uint32_t KernelWidth = KernelPR.Width;
   uint32_t SIMDWidth = KernelPR.SIMDWidth;
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 7229e2498ae13..2e2c2cd5a5bbf 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -172,6 +172,9 @@ void L0OptionsTy::processEnvironmentVars() {
       INVALID_OPTION(LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE,
                      CommandModeVar.get().c_str());
   }
+
+  // Detect if we need to enable compatibility with Level Zero debug mode.
+  ZeDebugEnabled = BoolEnvar("ZET_ENABLE_PROGRAM_DEBUGGING", false);
 }
 
 } // namespace llvm::omp::target::plugin

>From dd774b6c430f7e2bd5d900d24f6185b35c9bebad Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 13:00:33 +0200
Subject: [PATCH 32/70] fix & remove dead code

---
 offload/plugins-nextgen/level_zero/include/L0Device.h | 1 -
 offload/plugins-nextgen/level_zero/src/L0Device.cpp   | 8 ++++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 163eb459427af..35997b0aefd12 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -445,7 +445,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   bool useImmForCompute() const { return true; }
   bool useImmForCopy() const { return true; }
   bool useImmForInterop() const { return true; }
-  bool forceInorderInterop() const { return true; }
 
   void reportDeviceInfo() const;
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index e28734a32d1f5..e31fa30305c68 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -627,9 +627,13 @@ uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const {
 interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType,
                                                    int32_t NumPrefers,
                                                    interop_spec_t *Prefers) {
-  // no supported preference found, set default to level_zero, non-ordered
+  // no supported preference found, set default to level_zero,
+  // non-ordered unless the interop type is targetsync
   return interop_spec_t{
-      tgt_fr_level_zero, {forceInorderInterop() /*inorder*/, 0}, 0};
+      tgt_fr_level_zero,
+      {InteropType == kmp_interop_type_targetsync ? true : false /*inorder*/,
+       0},
+      0};
 }
 
 Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,

>From 7fa9dda9f9f9d9a02919a4d220c3bbff789ca8cd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 13:15:46 +0200
Subject: [PATCH 33/70] remove support for target ICVs

---
 .../plugins-nextgen/level_zero/CMakeLists.txt |  3 -
 .../level_zero/include/L0Defs.h               |  6 --
 .../level_zero/src/L0Kernel.cpp               | 29 +-------
 .../level_zero/src/OmpWrapper.cpp             | 71 -------------------
 4 files changed, 1 insertion(+), 108 deletions(-)
 delete mode 100644 offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 44156cc5a9354..719e46b03edaf 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -11,9 +11,6 @@ set(LEVEL_ZERO_SRC_FILES
   src/L0Program.cpp
   src/L0Options.cpp
 )
-list(APPEND LEVEL_ZERO_SRC_FILES
-  src/OmpWrapper.cpp
-)
 
 target_sources(omptarget.rtl.level_zero PRIVATE
   ${LEVEL_ZERO_SRC_FILES}
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index b0e56227fa809..868729c117955 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -27,12 +27,6 @@ enum class AllocOptionTy : int32_t {
   ALLOC_OPT_SLM = 4,
 };
 
-/// Host runtime routines being used
-extern "C" {
-LIBOMP_DECL(int, omp_get_max_teams(void));
-LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
-} // extern "C"
-
 #ifndef EXTRACT_BITS
 // MSB=63, LSB=0
 #define EXTRACT_BITS(I64, HIGH, LOW)                                           \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 6f76776296297..53642eba20475 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -54,33 +54,6 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
   return Plugin::success();
 }
 
-/// Read global thread limit and max teams from the host runtime. These values
-/// are subject to change at any program point, so every kernel execution
-/// needs to read the most recent values.
-static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
-  int32_t ThreadLimit;
-  ThreadLimit = omp_get_teams_thread_limit();
-  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThreadLimit);
-  // omp_get_teams_thread_limit() would return INT_MAX by default.
-  // NOTE: Windows.h defines max() macro, so we have to guard
-  //       the call with parentheses.
-  ThreadLimit =
-      (ThreadLimit > 0 && ThreadLimit != (std::numeric_limits<int32_t>::max)())
-          ? ThreadLimit
-          : 0;
-
-  int NTeams = omp_get_max_teams();
-  DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
-  // omp_get_max_teams() would return INT_MAX by default.
-  // NOTE: Windows.h defines max() macro, so we have to guard
-  //       the call with parentheses.
-  int32_t NumTeams =
-      (NTeams > 0 && NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams
-                                                                      : 0;
-
-  return {NumTeams, ThreadLimit};
-}
-
 void L0KernelTy::decideKernelGroupArguments(
     L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
     TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
@@ -407,7 +380,7 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
   const auto &KernelPR = getProperties();
 
   // Read the most recent global thread limit and max teams.
-  auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+  const auto [NumTeamsICV, ThreadLimitICV] = std::make_tuple(0, 0);
 
   bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
   bool HalfNumThreads =
diff --git a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
deleted file mode 100644
index 3721d686393bd..0000000000000
--- a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//===--- level_zero/src/OmpWrapper.cpp --------------------------- C++ -*-===//
-//
-// Implement wrapper for OpenMP compatibility through dlopen
-//
-//===----------------------------------------------------------------------===//
-
-#include "DLWrap.h"
-#include "Shared/Debug.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "L0Defs.h"
-
-DLWRAP_INITIALIZE()
-
-DLWRAP_INTERNAL(omp_get_max_teams, 0)
-DLWRAP_INTERNAL(omp_get_teams_thread_limit, 0)
-
-DLWRAP_FINALIZE()
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-static bool loadOpenMP() {
-  static bool Loaded{false};
-  if (Loaded)
-    return true;
-
-  const char *OpenMPLibrary = "libomp.so";
-  std::string ErrMsg;
-
-  DP("Trying to load %s\n", OpenMPLibrary);
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(OpenMPLibrary, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    if (ErrMsg.empty())
-      ErrMsg = "unknown error";
-    DP("Unable to load library '%s': %s!\n", OpenMPLibrary, ErrMsg.c_str());
-    return false;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, OpenMPLibrary);
-      return false;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-  return true;
-}
-
-int omp_get_max_teams() {
-  if (!loadOpenMP())
-    return 0;
-  return dlwrap_omp_get_max_teams();
-}
-
-int omp_get_teams_thread_limit() {
-  if (!loadOpenMP())
-    return 0;
-  return dlwrap_omp_get_teams_thread_limit();
-}

>From 96411ce3bcc4dd3cdb98d72bf1a91fa6c96e39ea Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 14:06:40 +0200
Subject: [PATCH 34/70] typo & format

---
 offload/plugins-nextgen/level_zero/include/L0Options.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index c8531fc50d6ff..459eef312f076 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -120,7 +120,7 @@ struct L0OptionsTy {
       "-cl-take-global-address";
   std::string UserCompilationOptions = "";
 
-  // Spec constants used for all modules.
+  /// Spec constants used for all modules.
   SpecConstantsTy CommonSpecConstants;
 
   /// Command execution mode.
@@ -128,8 +128,9 @@ struct L0OptionsTy {
   /// devices and whether immediate command list is fully enabled.
   CommandModeTy CommandMode = CommandModeTy::Async;
 
-  // Controlso if we need to reduce available HW threads. We need this adjustment
-  // on XeHPG when Level Zero debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  /// Controls if we need to reduce available HW threads. We need this
+  /// adjustment on XeHPG when Level Zero debug is enabled
+  /// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
   bool ZeDebugEnabled = false;
 
   bool Init = false; // have the options already been processed

>From 6c5278ff3a4677c636e82f373d1b1701a945d676 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 15:10:50 +0200
Subject: [PATCH 35/70] Adjust interfaces after merge

---
 .../level_zero/include/L0Device.h                | 16 +++++++++-------
 .../plugins-nextgen/level_zero/src/L0Device.cpp  | 16 +++++++++-------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 35997b0aefd12..96ddcbbb44e40 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -541,13 +541,14 @@ class L0DeviceTy final : public GenericDeviceTy {
   // Allocation related routines
 
   /// Data alloc
-  void *dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
-                  bool UserAlloc, bool DevMalloc = false,
-                  uint32_t MemAdvice = UINT32_MAX,
-                  AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
+  Expected<void *>
+  dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+            bool UserAlloc, bool DevMalloc = false,
+            uint32_t MemAdvice = UINT32_MAX,
+            AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
 
   /// Data delete
-  int32_t dataDelete(void *Ptr);
+  Error dataDelete(void *Ptr);
 
   /// Return the memory allocation type for the specified memory location.
   uint32_t getMemAllocType(const void *Ptr) const;
@@ -575,8 +576,9 @@ class L0DeviceTy final : public GenericDeviceTy {
   loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
                  int32_t ImageId) override;
   Error unloadBinaryImpl(DeviceImageTy *Image) override;
-  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
-  int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
+  Expected<void *> allocate(size_t Size, void *HstPtr,
+                            TargetAllocTy Kind) override;
+  Error free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
 
   Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
     return Plugin::error(error::ErrorCode::UNKNOWN,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index e31fa30305c68..b2cd08fc90fb4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -492,13 +492,14 @@ Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
   return Plugin::success();
 }
 
-void *L0DeviceTy::allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) {
+Expected<void *> L0DeviceTy::allocate(size_t Size, void *HstPtr,
+                                      TargetAllocTy Kind) {
   return dataAlloc(Size, /*Align=*/0, Kind,
                    /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
                    /*DevMalloc=*/false);
 }
 
-int L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+Error L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
   return dataDelete(TgtPtr);
 }
 
@@ -790,9 +791,10 @@ Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
   return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
 }
 
-void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
-                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
-                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+Expected<void *> L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
+                                       intptr_t Offset, bool UserAlloc,
+                                       bool DevMalloc, uint32_t MemAdvice,
+                                       AllocOptionTy AllocOpt) {
 
   const bool UseDedicatedPool =
       (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) ||
@@ -812,9 +814,9 @@ void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
                          MemAdvice, AllocOpt);
 }
 
-int32_t L0DeviceTy::dataDelete(void *Ptr) {
+Error L0DeviceTy::dataDelete(void *Ptr) {
   auto &Allocator = getMemAllocator(Ptr);
-  return Allocator.dealloc(Ptr);
+  return Plugin::check(Allocator.dealloc(Ptr), "Error deleting ptr");
 }
 
 int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {

>From 44b2f48c80b97b41250049e0cd4f3de69afdf663 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 30 Sep 2025 18:36:48 +0200
Subject: [PATCH 36/70] push some error checking down

---
 .../level_zero/include/L0Memory.h             | 10 ++---
 .../level_zero/src/L0Device.cpp               |  2 +-
 .../level_zero/src/L0Memory.cpp               | 38 +++++++++++--------
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index df29b01d46012..f36047e645fc6 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -307,7 +307,7 @@ class MemAllocatorTy {
   bool IsHostMem = false;
   // Internal deallocation function to be called when already
   // hondling the Mtx lock
-  int32_t dealloc_locked(void *Ptr);
+  Error dealloc_locked(void *Ptr);
 
 public:
   MemAllocatorTy() = default;
@@ -337,12 +337,12 @@ class MemAllocatorTy {
   void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
 
   /// Allocate memory with the specified information from a memory pool
-  void *alloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
-              bool UserAlloc, bool DevMalloc, uint32_t MemAdvice,
-              AllocOptionTy AllocOpt);
+  Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
+                         intptr_t Offset, bool UserAlloc, bool DevMalloc,
+                         uint32_t MemAdvice, AllocOptionTy AllocOpt);
 
   /// Deallocate memory
-  int32_t dealloc(void *Ptr) {
+  Error dealloc(void *Ptr) {
     std::lock_guard<std::mutex> Lock(Mtx);
     return dealloc_locked(Ptr);
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index b2cd08fc90fb4..6b13aa5219cef 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -816,7 +816,7 @@ Expected<void *> L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
 
 Error L0DeviceTy::dataDelete(void *Ptr) {
   auto &Allocator = getMemAllocator(Ptr);
-  return Plugin::check(Allocator.dealloc(Ptr), "Error deleting ptr");
+  return Allocator.dealloc(Ptr);
 }
 
 int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index fe80783c699d2..cd23bb39d167a 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -405,8 +405,11 @@ void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
 void MemAllocatorTy::deinit() {
   std::lock_guard<std::mutex> Lock(Mtx);
   // Release RTL-owned memory
-  for (auto *M : MemOwned)
-    dealloc_locked(M);
+  for (auto *M : MemOwned) {
+    auto Err = dealloc_locked(M);
+    if (Err)
+      consumeError(std::move(Err));
+  }
   // Release resources used in the pool
   Pools.clear();
   ReductionPool.reset(nullptr);
@@ -436,9 +439,10 @@ void MemAllocatorTy::deinit() {
 }
 
 /// Allocate memory with the specified information
-void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
-                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
-                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
+                                       intptr_t Offset, bool UserAlloc,
+                                       bool DevMalloc, uint32_t MemAdvice,
+                                       AllocOptionTy AllocOpt) {
   assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
           Kind == TARGET_ALLOC_SHARED) &&
          "Unknown memory kind while allocating target memory");
@@ -503,12 +507,13 @@ void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
 }
 
 /// Deallocate memory
-int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
+Error MemAllocatorTy::dealloc_locked(void *Ptr) {
   MemAllocInfoTy Info;
   if (!AllocInfo.remove(Ptr, &Info)) {
-    DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
-       DPxPTR(Ptr));
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::BACKEND_FAILURE,
+                         "Cannot find memory allocation information for " DPxMOD
+                         "\n",
+                         DPxPTR(Ptr));
   }
   if (Info.InPool) {
     size_t DeallocSize = 0;
@@ -521,24 +526,27 @@ int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
       if (DeallocSize == 0)
         DeallocSize = CounterPool->dealloc(Info.Base);
       if (DeallocSize == 0) {
-        DP("Error: Cannot return memory " DPxMOD " to pool\n", DPxPTR(Ptr));
-        return OFFLOAD_FAIL;
+        return Plugin::error(ErrorCode::BACKEND_FAILURE,
+                             "Cannot return memory " DPxMOD " to pool\n",
+                             DPxPTR(Ptr));
       }
     }
     log(0, DeallocSize, Info.Kind, true /* Pool */);
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
   if (!Info.Base) {
     DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr));
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Cannot find base address of " DPxMOD "\n",
+                         DPxPTR(Ptr));
   }
-  CALL_ZE_RET_FAIL(zeMemFree, L0Context->getZeContext(), Info.Base);
+  CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Info.Base);
   log(0, Info.Size, Info.Kind);
 
   DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
      DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
 
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,

>From b4e238f41e5bb8aeee5c156ac4d15bb92b9a00a0 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 9 Oct 2025 11:36:22 +0200
Subject: [PATCH 37/70] Update
 offload/plugins-nextgen/level_zero/include/L0Device.h

Co-authored-by: Nick Sarnie <nick.sarnie at intel.com>
---
 offload/plugins-nextgen/level_zero/include/L0Device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 96ddcbbb44e40..e6ebff0305a14 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -288,7 +288,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
              ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
              const std::string_view zeId, int32_t ComputeIndex)
-      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, SPIRVGridValues),
         l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
         ComputeIndex(ComputeIndex) {
     DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;

>From 166028d91c0f454ca174e6f64f7ebc48948c0c66 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 9 Oct 2025 11:49:03 +0200
Subject: [PATCH 38/70] address review comments

---
 .../plugins-nextgen/level_zero/include/L0Memory.h    |  6 ++++--
 offload/plugins-nextgen/level_zero/src/L0Device.cpp  |  8 +++-----
 offload/plugins-nextgen/level_zero/src/L0Memory.cpp  |  7 +++++--
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp  | 11 ++++++-----
 offload/plugins-nextgen/level_zero/src/L0Program.cpp | 12 ++++++++++--
 5 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index f36047e645fc6..9b02aa8568f96 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -200,8 +200,6 @@ class MemAllocatorTy {
     std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
     /// Need to zero-initialize after L0 allocation
     bool ZeroInit = false;
-    /// Zero-initialized values to be copied to device
-    std::vector<char> ZeroInitValue;
 
     /// Get bucket ID from the specified allocation size.
     uint32_t getBucketId(size_t Size) {
@@ -394,6 +392,10 @@ class MemAllocatorTy {
 
   /// Perform copy operation
   int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+
+  /// Perform memory fill operation
+  int32_t enqueueMemSet(void *Dst, int8_t Value, size_t Size);
+
 }; /// MemAllocatorTy
 
 // simple generic wrapper to reuse objects
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 6b13aa5219cef..715de0d1b3c12 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -701,14 +701,12 @@ int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
                                    bool UseCopyEngine) {
   ze_command_list_handle_t CmdList = nullptr;
   ze_command_queue_handle_t CmdQueue = nullptr;
-  ze_event_handle_t Event = nullptr;
 
   if (useImmForCopy()) {
     CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
-    Event = getEvent();
     CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
-                     Event, 0, nullptr);
-    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+                     nullptr, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeCommandListHostSynchronize, CmdList, UINT64_MAX);
   } else {
     if (UseCopyEngine) {
       CmdList = getCopyCmdList();
@@ -719,7 +717,7 @@ int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
     }
 
     CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
-                     Event, 0, nullptr);
+                     nullptr, 0, nullptr);
     CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
     CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
                          CmdQueue, 1, &CmdList, nullptr);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index cd23bb39d167a..d7b8601093a86 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -174,7 +174,6 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
   BucketStats.resize(1, {0, 0});
   BucketParams.emplace_back(AllocMax, AllocUnit);
   ZeroInit = true;
-  ZeroInitValue.resize(AllocUnit, 0);
   DP("Initialized zero-initialized reduction counter pool for "
      "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
      DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
@@ -266,7 +265,7 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
 
     if (ZeroInit) {
       auto RC =
-          Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
+          Allocator->enqueueMemSet(Base, 0, BlockSize);
       if (RC != OFFLOAD_SUCCESS) {
         DP("Failed to zero-initialize pool memory\n");
         return nullptr;
@@ -549,6 +548,10 @@ Error MemAllocatorTy::dealloc_locked(void *Ptr) {
   return Plugin::success();
 }
 
+int32_t MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) {
+  return Device->enqueueMemFill(Dst, &Value, sizeof(int8_t), Size);
+}
+
 int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
                                        size_t Size) {
   return Device->enqueueMemCopy(Dst, Src, Size);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 176e8d456a759..b134fb659738e 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -214,12 +214,8 @@ Error LevelZeroPluginTy::syncBarrierImpl(omp_interop_val_t *Interop) {
        " with ImmCmdList barrier\n",
        DPxPTR(Interop));
     auto ImmCmdList = L0->ImmCmdList;
-    auto Event = l0Device.getEvent();
 
-    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, Event, 0,
-                      nullptr);
-    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
-    l0Device.releaseEvent(Event);
+    CALL_ZE_RET_ERROR(zeCommandListHostSynchronize, ImmCmdList, UINT64_MAX);
   } else {
     DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
        " with queue synchronize\n",
@@ -254,6 +250,8 @@ Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
     CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, nullptr, 0,
                       nullptr);
   } else {
+#if 0
+    // TODO: re-enable once we have a way to delay the CmdList reset
     DP("LevelZeroPluginTy::async_barrier: Appending CmdList barrier to " DPxMOD
        "\n",
        DPxPTR(Interop));
@@ -264,6 +262,9 @@ Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
     CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
                       nullptr);
     CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+#else
+    return syncBarrierImpl(Interop);
+#endif
   }
 
   return Plugin::success();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 545ad3f5bf80c..9ca0bcc1b16dd 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -352,7 +352,11 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
           }
 
           uint64_t Idx = 0;
-          Parts[0].getAsInteger(10, Idx);
+          if (Parts[0].getAsInteger(10, Idx)) {
+            DP("Warning: ignoring auxiliary information (invalid index '%s').\n",
+               Parts[0].str().c_str());
+            continue;
+          }
           MaxImageIdx = (std::max)(MaxImageIdx, Idx);
           if (AuxInfo.find(Idx) != AuxInfo.end()) {
             DP("Warning: duplicate auxiliary information for image %" PRIu64
@@ -362,7 +366,11 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
           }
 
           uint64_t Part1Id;
-          Parts[1].getAsInteger(10, Part1Id);
+          if (Parts[1].getAsInteger(10, Part1Id)) {
+            DP("Warning: ignoring auxiliary information (invalid part id '%s').\n",
+               Parts[1].str().c_str());
+            continue;
+          }
 
           AuxInfo.emplace(
               std::piecewise_construct, std::forward_as_tuple(Idx),

>From 6629159761cdc894a53cb06a8da723ef3d31dffb Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 9 Oct 2025 12:13:20 +0200
Subject: [PATCH 39/70] format

---
 offload/plugins-nextgen/level_zero/src/L0Program.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 9ca0bcc1b16dd..0cd88b4ea97ef 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -353,7 +353,8 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
 
           uint64_t Idx = 0;
           if (Parts[0].getAsInteger(10, Idx)) {
-            DP("Warning: ignoring auxiliary information (invalid index '%s').\n",
+            DP("Warning: ignoring auxiliary information (invalid index "
+               "'%s').\n",
                Parts[0].str().c_str());
             continue;
           }
@@ -367,7 +368,8 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
 
           uint64_t Part1Id;
           if (Parts[1].getAsInteger(10, Part1Id)) {
-            DP("Warning: ignoring auxiliary information (invalid part id '%s').\n",
+            DP("Warning: ignoring auxiliary information (invalid part id "
+               "'%s').\n",
                Parts[1].str().c_str());
             continue;
           }

>From c00c288e557c806c056ffee405b3614807d1c8bb Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 9 Oct 2025 12:27:11 +0200
Subject: [PATCH 40/70] more format :/

---
 offload/plugins-nextgen/level_zero/src/L0Memory.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index d7b8601093a86..c26e3fb328645 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -264,8 +264,7 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
     void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
 
     if (ZeroInit) {
-      auto RC =
-          Allocator->enqueueMemSet(Base, 0, BlockSize);
+      auto RC = Allocator->enqueueMemSet(Base, 0, BlockSize);
       if (RC != OFFLOAD_SUCCESS) {
         DP("Failed to zero-initialize pool memory\n");
         return nullptr;

>From 93d394869d22d1619f535db2f3cbb0e7841f8e35 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 9 Oct 2025 18:25:48 +0200
Subject: [PATCH 41/70] add missed wrapper; remove unused macro

---
 offload/plugins-nextgen/level_zero/include/L0Defs.h     | 2 --
 offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 868729c117955..47dc25b85ce92 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -17,8 +17,6 @@
 #include "Shared/Requirements.h"
 #include "omptarget.h"
 
-#define LIBOMP_DECL(RetType, FnDecl) RetType __cdecl FnDecl
-
 enum class AllocOptionTy : int32_t {
   ALLOC_OPT_NONE = 0,
   ALLOC_OPT_REDUCTION_SCRATCH = 1,
diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
index 06f01f23285fc..e1ee9d5fa033b 100644
--- a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
@@ -83,6 +83,7 @@ DLWRAP(zeModuleDynamicLink, 3)
 DLWRAP(zeModuleGetGlobalPointer, 4)
 DLWRAP(zesDeviceEnumMemoryModules, 3)
 DLWRAP(zesMemoryGetState, 2)
+DLWRAP(zeCommandListHostSynchronize, 2)
 
 DLWRAP_FINALIZE()
 

>From 4a2140e8701d950a0e5bb593bef50baaec4289ff Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 15 Oct 2025 16:38:01 +0200
Subject: [PATCH 42/70] Remove level_zero from default plugin list

---
 offload/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 4a2890e5ca741..7aa203999c60e 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -150,7 +150,7 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
   message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
 endif()
 
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
 set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
     "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".")
 

>From bd43212f3b1a54ad879d4765b2baa9ca4340e317 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 15 Oct 2025 22:30:40 +0200
Subject: [PATCH 43/70] remove removed plugin interface

---
 offload/plugins-nextgen/level_zero/include/L0Device.h | 1 -
 offload/plugins-nextgen/level_zero/src/L0Device.cpp   | 8 --------
 2 files changed, 9 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index e6ebff0305a14..3c6f6cb89cf2c 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -607,7 +607,6 @@ class L0DeviceTy final : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
-  Error initDeviceInfoImpl(__tgt_device_info *Info) override;
   Expected<bool>
   hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 715de0d1b3c12..d2938882d6e76 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -545,14 +545,6 @@ Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
   return Plugin::success();
 }
 
-Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
-  if (!Info->Context)
-    Info->Context = getZeContext();
-  if (!Info->Device)
-    Info->Device = reinterpret_cast<void *>(getZeDevice());
-  return Plugin::success();
-}
-
 static const char *DriverVersionToStrTable[] = {
     "1.0", "1.1", "1.2", "1.3",  "1.4",  "1.5", "1.6",
     "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"};

>From 7f9c0c872cab47f38257b372d6e398642c7ed6c4 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 15 Oct 2025 23:13:34 +0200
Subject: [PATCH 44/70] update spirv target

---
 offload/CMakeLists.txt                            | 2 +-
 offload/plugins-nextgen/level_zero/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 041255dc8b01f..a72b213be8b54 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -226,7 +226,7 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} riscv64-unknown-linux
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} riscv64-unknown-linux-gnu-LTO")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} loongarch64-unknown-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} loongarch64-unknown-linux-gnu-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} spirv64")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} spirv64-intel")
 
 # Once the plugins for the different targets are validated, they will be added to
 # the list of supported targets in the current system.
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 231f6d0b39845..e9e034a4599d3 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -57,7 +57,7 @@ option(LIBOMPTARGET_FORCE_LEVELZERO_TESTS "Build Level Zero libomptarget tests"
 if (LIBOMPTARGET_FOUND_INTELGPU_GPU OR LIBOMPTARGET_FORCE_LEVELZERO_TESTS)
   # Report to the parent scope that we are building a plugin for intelgpu
   set(LIBOMPTARGET_SYSTEM_TARGETS
-      "${LIBOMPTARGET_SYSTEM_TARGETS} spirv64" PARENT_SCOPE)
+      "${LIBOMPTARGET_SYSTEM_TARGETS} spirv64-intel" PARENT_SCOPE)
   list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.level_zero")
   set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
 else()

>From 2d7cc66bbcaa43f355298f5d9d262b8084e4e871 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 20 Oct 2025 15:59:30 +0200
Subject: [PATCH 45/70] address some reviews

---
 offload/include/PerThreadTable.h                     | 10 +++++-----
 offload/plugins-nextgen/level_zero/include/L0Defs.h  | 10 ----------
 .../plugins-nextgen/level_zero/include/L0Device.h    |  4 +++-
 .../plugins-nextgen/level_zero/include/L0Kernel.h    |  6 ++++--
 .../plugins-nextgen/level_zero/include/L0Plugin.h    | 12 +++++-------
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp  |  8 --------
 6 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index 0241370953c67..fa4bc230a1fea 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -39,13 +39,13 @@ template <typename ObjectType> struct PerThread {
 
 private:
   PerThreadData &getThreadData() {
-    static thread_local std::shared_ptr<PerThreadData> ThData = nullptr;
-    if (!ThData) {
-      ThData = std::make_shared<PerThreadData>();
+    static thread_local std::shared_ptr<PerThreadData> ThreadData = nullptr;
+    if (!ThreadData) {
+      ThreadData = std::make_shared<PerThreadData>();
       std::lock_guard<std::mutex> Lock(Mtx);
-      ThreadDataList.push_back(ThData);
+      ThreadDataList.push_back(ThreadData);
     }
-    return *ThData;
+    return *ThreadData;
   }
 
 protected:
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 47dc25b85ce92..ed5c08730056f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -25,12 +25,6 @@ enum class AllocOptionTy : int32_t {
   ALLOC_OPT_SLM = 4,
 };
 
-#ifndef EXTRACT_BITS
-// MSB=63, LSB=0
-#define EXTRACT_BITS(I64, HIGH, LOW)                                           \
-  (((uint64_t)I64) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1)
-#endif
-
 namespace llvm::omp::target::plugin {
 
 /// Default alignmnet for allocation
@@ -58,10 +52,6 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
                                                     ErrFmt, Args..., Desc);
 }
 
-#define L0_UNIMPLEMENTED_ERR                                                   \
-  return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",     \
-                       __func__);
-
 } // namespace llvm::omp::target::plugin
 
 #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 3c6f6cb89cf2c..264a9f47b2641 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -612,7 +612,9 @@ class L0DeviceTy final : public GenericDeviceTy {
 
   Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
                             AsyncInfoWrapperTy &AsyncInfo) override{
-      L0_UNIMPLEMENTED_ERR}
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "enqueueHostCallImpl not implemented yet");
+  }
 
   /* Event routines are used to ensure ordering between dataTransfers. Instead
    * of adding extra events in the queues, we make sure they're ordered by
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index c5a3528dd2974..dcc829286d8ef 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -140,8 +140,10 @@ class L0KernelTy : public GenericKernelTy {
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
   Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
-                                  uint64_t DynamicMemSize) const override{
-      L0_UNIMPLEMENTED_ERR}
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "maxGroupSize not implemented yet");
+  }
 
   ze_kernel_handle_t getZeKernel() const {
     return zeKernel;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 9fbdafa288592..b910406871068 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -46,8 +46,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   /// L0 plugin global options
   static L0OptionsTy Options;
 
-  std::mutex GlobalMutex;
-
   /// Common pool of AsyncQueue
   AsyncQueuePoolTy AsyncQueuePool;
 
@@ -64,8 +62,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 
   static const auto &getOptions() { return Options; }
 
-  auto &getGlobalMutex() { return GlobalMutex; }
-
   struct DevicesRangeTy {
     using iterator = DeviceContainerTy::iterator;
 
@@ -122,9 +118,11 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                 int32_t NumDevices) override;
   GenericGlobalHandlerTy *createGlobalHandler() override;
-  uint16_t getMagicElfBits() const override;
-  Triple::ArchType getTripleArch() const override;
-  const char *getName() const override;
+
+  uint16_t getMagicElfBits() const override { return ELF::EM_INTELGT; }
+  Triple::ArchType getTripleArch() const override { return Triple::spirv64; }
+  const char *getName() const override { return GETNAME(TARGET_NAME); }
+
   Expected<bool> isELFCompatible(uint32_t DeviceId,
                                  StringRef Image) const override;
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index b134fb659738e..37e26018d21c0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -176,14 +176,6 @@ GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
   return new L0GlobalHandlerTy();
 }
 
-uint16_t LevelZeroPluginTy::getMagicElfBits() const { return ELF::EM_INTELGT; }
-
-Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
-  return Triple::spirv64;
-}
-
-const char *LevelZeroPluginTy::getName() const { return GETNAME(TARGET_NAME); }
-
 Error LevelZeroPluginTy::flushQueueImpl(omp_interop_val_t *Interop) {
   return Plugin::success();
 }

>From 91e51edf890d655e98c54ace19bf08652f4b58f1 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 20 Oct 2025 17:49:01 +0200
Subject: [PATCH 46/70] address more reviews

---
 .../level_zero/include/L0Device.h             | 52 ++++++++-----------
 .../level_zero/include/L0Kernel.h             | 20 +++++--
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 264a9f47b2641..29cb7faacaa09 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -106,19 +106,10 @@ class L0DeviceTLSTy {
   L0DeviceTLSTy() = default;
   ~L0DeviceTLSTy() {
     // assert all fields are nullptr on destruction
-    assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
-    assert(CopyCmdList == nullptr &&
-           "CopyCmdList is not nullptr on destruction");
-    assert(LinkCopyCmdList == nullptr &&
-           "LinkCopyCmdList is not nullptr on destruction");
-    assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
-    assert(CopyCmdQueue == nullptr &&
-           "CopyCmdQueue is not nullptr on destruction");
-    assert(LinkCopyCmdQueue == nullptr &&
-           "LinkCopyCmdQueue is not nullptr on destruction");
-    assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
-    assert(ImmCopyCmdList == nullptr &&
-           "ImmCopyCmdList is not nullptr on destruction");
+    assert(!CmdList && !CopyCmdList && !LinkCopyCmdList && !CmdQueue &&
+           !CopyCmdQueue && !LinkCopyCmdQueue && !ImmCmdList &&
+           !ImmCopyCmdList &&
+           "L0DeviceTLSTy destroyed without clearing resources");
   }
 
   L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
@@ -179,28 +170,28 @@ class L0DeviceTLSTy {
   }
 
   auto getImmCmdList() const { return ImmCmdList; }
-  void setImmCmdList(ze_command_list_handle_t _ImmCmdList) {
-    ImmCmdList = _ImmCmdList;
+  void setImmCmdList(ze_command_list_handle_t ImmCmdListIn) {
+    ImmCmdList = ImmCmdListIn;
   }
 
   auto getImmCopyCmdList() const { return ImmCopyCmdList; }
-  void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) {
-    ImmCopyCmdList = _ImmCopyCmdList;
+  void setImmCopyCmdList(ze_command_list_handle_t ImmCopyCmdListIn) {
+    ImmCopyCmdList = ImmCopyCmdListIn;
   }
 
   auto getCmdQueue() const { return CmdQueue; }
-  void setCmdQueue(ze_command_queue_handle_t _CmdQueue) {
-    CmdQueue = _CmdQueue;
+  void setCmdQueue(ze_command_queue_handle_t CmdQueueIn) {
+    CmdQueue = CmdQueueIn;
   }
 
   auto getCopyCmdQueue() const { return CopyCmdQueue; }
-  void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) {
-    CopyCmdQueue = _CopyCmdQueue;
+  void setCopyCmdQueue(ze_command_queue_handle_t CopyCmdQueueIn) {
+    CopyCmdQueue = CopyCmdQueueIn;
   }
 
   auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
-  void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) {
-    LinkCopyCmdQueue = _LinkCopyCmdQueue;
+  void setLinkCopyCmdQueue(ze_command_queue_handle_t LinkCopyCmdQueueIn) {
+    LinkCopyCmdQueue = LinkCopyCmdQueueIn;
   }
 };
 
@@ -611,18 +602,17 @@ class L0DeviceTy final : public GenericDeviceTy {
   hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
   Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
-                            AsyncInfoWrapperTy &AsyncInfo) override{
+                            AsyncInfoWrapperTy &AsyncInfo) override {
     return Plugin::error(ErrorCode::UNIMPLEMENTED,
                          "enqueueHostCallImpl not implemented yet");
   }
 
-  /* Event routines are used to ensure ordering between dataTransfers. Instead
-   * of adding extra events in the queues, we make sure they're ordered by
-   * using the events from the data submission APIs so we don't need to support
-   * these routines.
-   * They still need to report succes to indicate the event are handled
-   * somewhere waitEvent and syncEvent should remain unimplemented
-   */
+  // Event routines are used to ensure ordering between dataTransfers. Instead
+  // of adding extra events in the queues, we make sure they're ordered by
+  // using the events from the data submission APIs so we don't need to support
+  // these routines.
+  // They still need to report success to indicate the events are handled
+  // somewhere. waitEvent and syncEvent should remain unimplemented.
   Expected<bool> isEventCompleteImpl(void *EventPtr,
                                      AsyncInfoWrapperTy &) override {
     return true;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index dcc829286d8ef..fe51558765ca3 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -27,6 +27,10 @@ struct TgtLoopDescTy {
   int64_t Lb = 0;     // The lower bound of the i-th loop
   int64_t Ub = 0;     // The upper bound of the i-th loop
   int64_t Stride = 0; // The stride of the i-th loop
+
+  bool operator==(const TgtLoopDescTy &other) const {
+    return Lb == other.Lb && Ub == other.Ub && Stride == other.Stride;
+  }
 };
 
 struct TgtNDRangeDescTy {
@@ -34,6 +38,14 @@ struct TgtNDRangeDescTy {
   int32_t DistributeDim = 0; // Dimensions lower than this one
                              // must end up in one WG
   TgtLoopDescTy Levels[3];   // Up to 3 loops
+
+  bool operator==(const TgtNDRangeDescTy &other) const {
+    return NumLoops == other.NumLoops && DistributeDim == other.DistributeDim &&
+           std::equal(Levels, Levels + 3, other.Levels);
+  }
+  bool operator!=(const TgtNDRangeDescTy &other) const {
+    return !(*this == other);
+  }
 };
 
 /// Kernel properties.
@@ -63,9 +75,9 @@ struct KernelPropertiesTy {
                         uint32_t *GroupSizesOut,
                         ze_group_count_t &GroupCountsOut,
                         bool &AllowCooperativeOut) const {
-    if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
+    if (!LoopDescPtr && LoopDescInit != LoopDesc)
       return false;
-    if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc)))
+    if (LoopDescPtr && *LoopDescPtr != LoopDesc)
       return false;
     if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
       return false;
@@ -145,9 +157,7 @@ class L0KernelTy : public GenericKernelTy {
                          "maxGroupSize not implemented yet");
   }
 
-  ze_kernel_handle_t getZeKernel() const {
-    return zeKernel;
-  }
+  ze_kernel_handle_t getZeKernel() const { return zeKernel; }
 
   int32_t getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                          int32_t ThreadLimit, uint32_t *GroupSizes,

>From 12ab478d1c66e56286fe0d3ad22c3670df67c67d Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 20 Oct 2025 19:40:00 +0200
Subject: [PATCH 47/70] address more comments

---
 .../level_zero/include/L0Context.h            |  4 +-
 .../level_zero/include/L0Device.h             |  2 +-
 .../level_zero/include/L0Memory.h             | 37 +++++++------------
 .../level_zero/include/L0Plugin.h             |  4 --
 .../level_zero/src/L0Memory.cpp               | 37 ++++++++++++++-----
 .../level_zero/src/L0Plugin.cpp               | 26 ++++---------
 6 files changed, 53 insertions(+), 57 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 29d01bb7b2a2a..926ac51d6e669 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -81,7 +81,9 @@ class L0ContextTy {
   /// Release resources
   ~L0ContextTy() {
     EventPool.deinit();
-    HostMemAllocator.deinit();
+    auto Err = HostMemAllocator.deinit();
+    if (Err)
+     consumeError(std::move(Err));
     if (zeContext)
       CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
   }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 29cb7faacaa09..60cf413e93da9 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -308,7 +308,7 @@ class L0DeviceTy final : public GenericDeviceTy {
   Error initImpl(GenericPluginTy &Plugin) override;
   Error deinitImpl() override {
     Programs.clear();
-    return Plugin::success();
+    return MemAllocator.deinit();
   }
 
   auto getZeDevice() const { return zeDevice; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 9b02aa8568f96..9dff1dc76550e 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -27,14 +27,6 @@ namespace llvm::omp::target::plugin {
 
 class L0DeviceTy;
 
-#define ALLOC_KIND_TO_STR(Kind)                                                \
-  (Kind == TARGET_ALLOC_HOST                                                   \
-       ? "host memory"                                                         \
-       : (Kind == TARGET_ALLOC_SHARED                                          \
-              ? "shared memory"                                                \
-              : (Kind == TARGET_ALLOC_DEVICE ? "device memory"                 \
-                                             : "unknown memory")))
-
 // forward declarations
 struct L0OptionsTy;
 class L0DeviceTy;
@@ -85,10 +77,10 @@ struct MemAllocInfoTy {
 
   MemAllocInfoTy() = default;
 
-  MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
-                 bool _ImplicitArg)
-      : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
-        ImplicitArg(_ImplicitArg) {}
+  MemAllocInfoTy(void *Base, size_t Size, int32_t Kind, bool InPool,
+                 bool ImplicitArg)
+      : Base(Base), Size(Size), Kind(Kind), InPool(InPool),
+        ImplicitArg(ImplicitArg) {}
 };
 
 /// Responsible for all activities involving memory allocation/deallocation.
@@ -130,7 +122,7 @@ class MemAllocatorTy {
       /// Number of slots in use
       uint32_t NumUsedSlots = 0;
       /// Cached available slot returned by the last dealloc() call
-      uint32_t FreeSlot = UINT32_MAX;
+      uint32_t FreeSlot = std::numeric_limits<uint32_t>::max();
       /// Marker for the currently used slots
       std::vector<bool> UsedSlots;
 
@@ -286,7 +278,7 @@ class MemAllocatorTy {
   /// Whether the device supports large memory allocation
   bool SupportsLargeMem = false;
   /// Cached max alloc size supported by device
-  uint64_t MaxAllocSize = INT64_MAX;
+  uint64_t MaxAllocSize;
   /// Map from allocation kind to memory statistics
   std::unordered_map<int32_t, MemStatTy> Stats;
   /// Map from allocation kind to memory pool
@@ -305,22 +297,20 @@ class MemAllocatorTy {
   bool IsHostMem = false;
   // Internal deallocation function to be called when already
   // hondling the Mtx lock
-  Error dealloc_locked(void *Ptr);
+  Error deallocLocked(void *Ptr);
 
 public:
-  MemAllocatorTy() = default;
+  MemAllocatorTy() : MaxAllocSize(std::numeric_limits<uint64_t>::max()) {}
 
   MemAllocatorTy(const MemAllocatorTy &) = delete;
   MemAllocatorTy(MemAllocatorTy &&) = delete;
   MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
   MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
 
+  ~MemAllocatorTy() {}
+
   /// Release resources and report statistics if requested
-  ~MemAllocatorTy() {
-    if (L0Context)
-      deinit(); // Release resources
-  }
-  void deinit();
+  Error deinit();
 
   /// Allocator only supports host memory
   bool supportsHostMem() { return IsHostMem; }
@@ -342,7 +332,7 @@ class MemAllocatorTy {
   /// Deallocate memory
   Error dealloc(void *Ptr) {
     std::lock_guard<std::mutex> Lock(Mtx);
-    return dealloc_locked(Ptr);
+    return deallocLocked(Ptr);
   }
 
   /// Check if the given memory location and offset belongs to any allocated
@@ -561,7 +551,8 @@ class StagingBufferTy {
     if (NeedToGrow)
       Ret = addBuffers();
     else
-      Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize));
+      Ret = reinterpret_cast<void *>(
+          reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
 
     if (!Ret)
       return nullptr;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index b910406871068..971e665b2954f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -79,10 +79,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
     return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
   }
 
-  /// Clean-up routine to be invoked by the destructor or
-  /// LevelZeroPluginTy::deinit.
-  void closeRTL();
-
   /// Find L0 devices and initialize device properties.
   /// Returns number of devices reported to omptarget.
   int32_t findDevices();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index c26e3fb328645..a067a83a19874 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -16,6 +16,21 @@
 
 namespace llvm::omp::target::plugin {
 
+#if LIBOMPTARGET_DEBUG
+static const char * AllocKindToStr(int32_t Kind) {
+  switch (Kind) {
+  case TARGET_ALLOC_DEVICE:
+    return "DEVICE";
+  case TARGET_ALLOC_HOST:
+    return "HOST";
+  case TARGET_ALLOC_SHARED:
+    return "SHARED";
+  default:
+    return "DEFAULT";
+  }
+}
+#endif
+
 void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
   if (isFull())
     return nullptr;
@@ -101,7 +116,7 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
     AllocMax = 2 * AllocMin;
     DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
        "requirements.\n",
-       AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+       AllocMax, AllocKindToStr(AllocKind));
   }
   assert(AllocMin < AllocMax &&
          "Invalid parameters while initializing memory pool");
@@ -133,7 +148,7 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
   DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
      "AllocMax = %zu, "
      "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
-     ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+     AllocKindToStr(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
      BlockCapacity, PoolSizeMax);
 }
 
@@ -195,7 +210,7 @@ void MemAllocatorTy::MemPoolTy::printUsage() {
     }
   }
 
-  DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+  DP("MemPool usage for %s, device " DPxMOD "\n", AllocKindToStr(AllocKind),
      DPxPTR(Allocator->Device));
 
   if (HasPoolAlloc) {
@@ -281,7 +296,7 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
       PoolSize += BlockSize;
     DP("New block allocation for %s pool: base = " DPxMOD
        ", size = %zu, pool size = %zu\n",
-       ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
+       AllocKindToStr(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
     BucketStats[BucketId].first++;
   } else {
     BucketStats[BucketId].second++;
@@ -400,13 +415,16 @@ void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
 }
 
 /// Release resources and report statistics if requested
-void MemAllocatorTy::deinit() {
+Error MemAllocatorTy::deinit() {
+  if (!L0Context)
+      return Plugin::success();
+
   std::lock_guard<std::mutex> Lock(Mtx);
   // Release RTL-owned memory
   for (auto *M : MemOwned) {
-    auto Err = dealloc_locked(M);
+    auto Err = deallocLocked(M);
     if (Err)
-      consumeError(std::move(Err));
+      return Err;
   }
   // Release resources used in the pool
   Pools.clear();
@@ -416,7 +434,7 @@ void MemAllocatorTy::deinit() {
   if (getDebugLevel() > 0) {
     for (auto &Stat : Stats) {
       DP("Memory usage for %s, device " DPxMOD "\n",
-         ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device));
+         AllocKindToStr(Stat.first), DPxPTR(Device));
       const auto &ST = Stat.second;
       if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
         DP("-- Not used\n");
@@ -434,6 +452,7 @@ void MemAllocatorTy::deinit() {
 
   // mark as deinitialized
   L0Context = nullptr;
+  return Plugin::success();
 }
 
 /// Allocate memory with the specified information
@@ -505,7 +524,7 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
 }
 
 /// Deallocate memory
-Error MemAllocatorTy::dealloc_locked(void *Ptr) {
+Error MemAllocatorTy::deallocLocked(void *Ptr) {
   MemAllocInfoTy Info;
   if (!AllocInfo.remove(Ptr, &Info)) {
     return Plugin::error(ErrorCode::BACKEND_FAILURE,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 37e26018d21c0..7cf47a5da7be5 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -92,15 +92,11 @@ int32_t LevelZeroPluginTy::findDevices() {
 
   llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
 
-  // helper lambda
-  auto addDevice = [&DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
-                                   int32_t SubId = -1, int32_t CCSId = -1) {
-    DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
-  };
   for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
     const auto zeDevice = RootDevices[RootId].zeDevice;
     auto *RootDriver = RootDevices[RootId].Driver;
-    addDevice(zeDevice, RootDriver, RootId);
+    DevicesToAdd.push_back(
+        {{zeDevice, static_cast<int32_t>(RootId), -1, -1}, RootDriver});
   }
   NumDevices = DevicesToAdd.size();
   auto DeviceId = 0;
@@ -141,18 +137,6 @@ int32_t LevelZeroPluginTy::findDevices() {
   return getNumRootDevices();
 }
 
-/// Clean-up routine to be invoked by the destructor or
-/// LevelZeroPluginTy::deinit.
-void LevelZeroPluginTy::closeRTL() {
-
-  ContextTLSTable.clear();
-  DeviceTLSTable.clear();
-  ThreadTLSTable.clear();
-  ContextList.clear();
-
-  DP("Plugin closed successfully\n");
-}
-
 Expected<int32_t> LevelZeroPluginTy::initImpl() {
   DP("Level0 NG plugin initialization\n");
   // process options before anything else
@@ -162,7 +146,11 @@ Expected<int32_t> LevelZeroPluginTy::initImpl() {
 
 Error LevelZeroPluginTy::deinitImpl() {
   DP("Deinit Level0 plugin!\n");
-  closeRTL();
+  ContextTLSTable.clear();
+  DeviceTLSTable.clear();
+  ThreadTLSTable.clear();
+  ContextList.clear();
+  DP("Level0 plugin deinitialized successfully\n");
   return Plugin::success();
 }
 

>From 3287b1ab07f593e16a8d1bf74608801eb6087965 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 15:12:37 +0200
Subject: [PATCH 48/70] change some containers

---
 offload/include/omptarget.h                       |  1 +
 .../level_zero/include/L0Context.h                |  2 +-
 .../plugins-nextgen/level_zero/include/L0Memory.h | 14 +++++++++++---
 .../plugins-nextgen/level_zero/src/L0Memory.cpp   | 15 +++++++--------
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 794b79e07674e..daaf83140205f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -101,6 +101,7 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
   TARGET_ALLOC_DEFAULT,
+  TARGET_ALLOC_LAST = TARGET_ALLOC_DEFAULT
 };
 
 inline KernelArgsTy CTorDTorKernelArgs = {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 926ac51d6e669..35f180bdd8c70 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -83,7 +83,7 @@ class L0ContextTy {
     EventPool.deinit();
     auto Err = HostMemAllocator.deinit();
     if (Err)
-     consumeError(std::move(Err));
+      consumeError(std::move(Err));
     if (zeContext)
       CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
   }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 9dff1dc76550e..fd7358e18e3c8 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -97,6 +97,10 @@ class MemAllocatorTy {
     size_t PeakUse[2] = {0, 0};   // Peak bytes used
     size_t NumAllocs[2] = {0, 0}; // Number of allocations
     MemStatTy() = default;
+
+    MemStatTy(const MemStatTy &) = default;
+    MemStatTy(MemStatTy &&) = default;
+    ~MemStatTy() = default;
   };
 
   /// Memory pool which enables reuse of already allocated blocks
@@ -230,10 +234,11 @@ class MemAllocatorTy {
 
   /// Allocation information maintained in the plugin
   class MemAllocInfoMapTy {
+    constexpr static int32_t MaxKind = TARGET_ALLOC_LAST + 1;
     /// Map from allocated pointer to allocation information
     std::map<void *, MemAllocInfoTy> Map;
     /// Map from target alloc kind to number of implicit arguments
-    std::map<int32_t, uint32_t> NumImplicitArgs;
+    std::array<uint32_t, MaxKind> NumImplicitArgs;
 
   public:
     /// Add allocation information to the map
@@ -268,7 +273,10 @@ class MemAllocatorTy {
 
     /// Returns the number of implicit arguments for the specified allocation
     /// kind.
-    size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+    size_t getNumImplicitArgs(int32_t Kind) {
+      assert(Kind >= 0 && Kind < MaxKind && "Invalid target allocation kind");
+      return NumImplicitArgs[Kind];
+    }
   }; // MemAllocInfoMapTy
 
   /// L0 context to use
@@ -280,7 +288,7 @@ class MemAllocatorTy {
   /// Cached max alloc size supported by device
   uint64_t MaxAllocSize;
   /// Map from allocation kind to memory statistics
-  std::unordered_map<int32_t, MemStatTy> Stats;
+  llvm::DenseMap<int32_t, MemStatTy> Stats;
   /// Map from allocation kind to memory pool
   std::unordered_map<int32_t, MemPoolTy> Pools;
   /// Memory pool dedicated to reduction scratch space
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index a067a83a19874..2440d0a794cd2 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -17,7 +17,7 @@
 namespace llvm::omp::target::plugin {
 
 #if LIBOMPTARGET_DEBUG
-static const char * AllocKindToStr(int32_t Kind) {
+static const char *AllocKindToStr(int32_t Kind) {
   switch (Kind) {
   case TARGET_ALLOC_DEVICE:
     return "DEVICE";
@@ -336,6 +336,7 @@ void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
     }
   }
   assert(Valid && "Invalid overlapping memory allocation");
+  assert(Kind >= 0 && Kind < MaxKind && "Invalid target allocation kind");
   if (ImplicitArg)
     NumImplicitArgs[Kind]++;
 }
@@ -367,8 +368,7 @@ void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
                     std::forward_as_tuple(Kind, this, Option));
     }
     if (getDebugLevel() > 0)
-      Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
-                    std::tuple<>{});
+      Stats.try_emplace(Kind);
   }
   ReductionPool = std::make_unique<MemPoolTy>(this, Option);
   CounterPool = std::make_unique<MemPoolTy>(this);
@@ -387,8 +387,7 @@ void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
                   std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
   }
   if (getDebugLevel() > 0)
-    Stats.emplace(std::piecewise_construct,
-                  std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{});
+    Stats.try_emplace(TARGET_ALLOC_HOST);
 }
 
 void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
@@ -417,7 +416,7 @@ void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
 /// Release resources and report statistics if requested
 Error MemAllocatorTy::deinit() {
   if (!L0Context)
-      return Plugin::success();
+    return Plugin::success();
 
   std::lock_guard<std::mutex> Lock(Mtx);
   // Release RTL-owned memory
@@ -433,8 +432,8 @@ Error MemAllocatorTy::deinit() {
   // Report memory usage if requested
   if (getDebugLevel() > 0) {
     for (auto &Stat : Stats) {
-      DP("Memory usage for %s, device " DPxMOD "\n",
-         AllocKindToStr(Stat.first), DPxPTR(Device));
+      DP("Memory usage for %s, device " DPxMOD "\n", AllocKindToStr(Stat.first),
+         DPxPTR(Device));
       const auto &ST = Stat.second;
       if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
         DP("-- Not used\n");

>From 69abec035a12533812d55c6498573c2319f8e2f6 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 15:41:35 +0200
Subject: [PATCH 49/70] more container changes

---
 .../level_zero/include/L0Memory.h             | 19 +++++----
 .../level_zero/src/L0Memory.cpp               | 39 +++++++------------
 2 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index fd7358e18e3c8..680ee375cdd56 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -32,6 +32,8 @@ struct L0OptionsTy;
 class L0DeviceTy;
 class L0ContextTy;
 
+constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;
+
 struct DynamicMemHeapTy {
   /// Base address memory is allocated from
   uintptr_t AllocBase = 0;
@@ -97,10 +99,6 @@ class MemAllocatorTy {
     size_t PeakUse[2] = {0, 0};   // Peak bytes used
     size_t NumAllocs[2] = {0, 0}; // Number of allocations
     MemStatTy() = default;
-
-    MemStatTy(const MemStatTy &) = default;
-    MemStatTy(MemStatTy &&) = default;
-    ~MemStatTy() = default;
   };
 
   /// Memory pool which enables reuse of already allocated blocks
@@ -234,11 +232,10 @@ class MemAllocatorTy {
 
   /// Allocation information maintained in the plugin
   class MemAllocInfoMapTy {
-    constexpr static int32_t MaxKind = TARGET_ALLOC_LAST + 1;
     /// Map from allocated pointer to allocation information
     std::map<void *, MemAllocInfoTy> Map;
     /// Map from target alloc kind to number of implicit arguments
-    std::array<uint32_t, MaxKind> NumImplicitArgs;
+    std::array<uint32_t, MaxMemKind> NumImplicitArgs;
 
   public:
     /// Add allocation information to the map
@@ -274,7 +271,8 @@ class MemAllocatorTy {
     /// Returns the number of implicit arguments for the specified allocation
     /// kind.
     size_t getNumImplicitArgs(int32_t Kind) {
-      assert(Kind >= 0 && Kind < MaxKind && "Invalid target allocation kind");
+      assert(Kind >= 0 && Kind < MaxMemKind &&
+             "Invalid target allocation kind");
       return NumImplicitArgs[Kind];
     }
   }; // MemAllocInfoMapTy
@@ -288,9 +286,10 @@ class MemAllocatorTy {
   /// Cached max alloc size supported by device
   uint64_t MaxAllocSize;
   /// Map from allocation kind to memory statistics
-  llvm::DenseMap<int32_t, MemStatTy> Stats;
+  std::array<MemStatTy, MaxMemKind> Stats;
   /// Map from allocation kind to memory pool
-  std::unordered_map<int32_t, MemPoolTy> Pools;
+  std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
+
   /// Memory pool dedicated to reduction scratch space
   std::unique_ptr<MemPoolTy> ReductionPool;
   /// Memory pool dedicated to reduction counters
@@ -371,7 +370,7 @@ class MemAllocatorTy {
 
   /// Log memory allocation/deallocation
   void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
-    if (Stats.count(Kind) == 0)
+    if (Kind < 0 || Kind >= MaxMemKind)
       return; // Stat is disabled
 
     auto &ST = Stats[Kind];
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 2440d0a794cd2..4afe6d1701ce5 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -336,7 +336,7 @@ void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
     }
   }
   assert(Valid && "Invalid overlapping memory allocation");
-  assert(Kind >= 0 && Kind < MaxKind && "Invalid target allocation kind");
+  assert(Kind >= 0 && Kind < MaxMemKind && "Invalid target allocation kind");
   if (ImplicitArg)
     NumImplicitArgs[Kind]++;
 }
@@ -364,11 +364,8 @@ void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
   for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
     if (Option.MemPoolInfo.count(Kind) > 0) {
       std::lock_guard<std::mutex> Lock(Mtx);
-      Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
-                    std::forward_as_tuple(Kind, this, Option));
+      Pools[Kind] = std::make_unique<MemPoolTy>(Kind, this, Option);
     }
-    if (getDebugLevel() > 0)
-      Stats.try_emplace(Kind);
   }
   ReductionPool = std::make_unique<MemPoolTy>(this, Option);
   CounterPool = std::make_unique<MemPoolTy>(this);
@@ -382,12 +379,9 @@ void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
   this->L0Context = &Driver;
   if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
     std::lock_guard<std::mutex> Lock(Mtx);
-    Pools.emplace(std::piecewise_construct,
-                  std::forward_as_tuple(TARGET_ALLOC_HOST),
-                  std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
+    Pools[TARGET_ALLOC_HOST] =
+        std::make_unique<MemPoolTy>(TARGET_ALLOC_HOST, this, Option);
   }
-  if (getDebugLevel() > 0)
-    Stats.try_emplace(TARGET_ALLOC_HOST);
 }
 
 void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
@@ -425,8 +419,6 @@ Error MemAllocatorTy::deinit() {
     if (Err)
       return Err;
   }
-  // Release resources used in the pool
-  Pools.clear();
   ReductionPool.reset(nullptr);
   CounterPool.reset(nullptr);
   // Report memory usage if requested
@@ -434,18 +426,17 @@ Error MemAllocatorTy::deinit() {
     for (auto &Stat : Stats) {
       DP("Memory usage for %s, device " DPxMOD "\n", AllocKindToStr(Stat.first),
          DPxPTR(Device));
-      const auto &ST = Stat.second;
-      if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
+      if (Stat.NumAllocs[0] == 0 && Stat.NumAllocs[1] == 0) {
         DP("-- Not used\n");
         continue;
       }
       DP("-- Allocator: %12s, %12s\n", "Native", "Pool");
-      DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]);
-      DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]);
-      DP("-- Freed    : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]);
-      DP("-- InUse    : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]);
-      DP("-- PeakUse  : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]);
-      DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]);
+      DP("-- Requested: %12zu, %12zu\n", Stat.Requested[0], Stat.Requested[1]);
+      DP("-- Allocated: %12zu, %12zu\n", Stat.Allocated[0], Stat.Allocated[1]);
+      DP("-- Freed    : %12zu, %12zu\n", Stat.Freed[0], Stat.Freed[1]);
+      DP("-- InUse    : %12zu, %12zu\n", Stat.InUse[0], Stat.InUse[1]);
+      DP("-- PeakUse  : %12zu, %12zu\n", Stat.PeakUse[0], Stat.PeakUse[1]);
+      DP("-- NumAllocs: %12zu, %12zu\n", Stat.NumAllocs[0], Stat.NumAllocs[1]);
     }
   }
 
@@ -477,7 +468,7 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
       (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
   const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
 
-  if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+  if ((Pools[Kind] != nullptr && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
     // Pool is enabled for the allocation kind, and we do not use any memory
     // advice. We should avoid using pool if there is any meaningful memory
     // advice not to affect sibling allocation in the same block.
@@ -489,7 +480,7 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
     else if (UseZeroInitPool)
       AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
     else
-      AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize);
+      AllocBase = Pools[Kind]->alloc(AllocSize, PoolAllocSize);
     if (AllocBase) {
       uintptr_t Base = (uintptr_t)AllocBase;
       if (Align > 0)
@@ -533,8 +524,8 @@ Error MemAllocatorTy::deallocLocked(void *Ptr) {
   }
   if (Info.InPool) {
     size_t DeallocSize = 0;
-    if (Pools.count(Info.Kind) > 0)
-      DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base);
+    if (Pools[Info.Kind] != nullptr)
+      DeallocSize = Pools[Info.Kind]->dealloc(Info.Base);
     if (DeallocSize == 0) {
       // Try reduction scratch pool
       DeallocSize = ReductionPool->dealloc(Info.Base);

>From b7e95abf1a7ace37b66137d6ef836f8439c21694 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 16:26:49 +0200
Subject: [PATCH 50/70] move unsafe code out of destructors

---
 .../level_zero/include/L0Context.h            |  9 ++++---
 .../level_zero/include/L0Device.h             |  6 +----
 .../level_zero/include/L0Kernel.h             |  9 ++++---
 .../level_zero/include/L0Memory.h             |  9 +++----
 .../level_zero/include/L0Program.h            |  5 ++--
 .../level_zero/src/L0Device.cpp               |  7 +++++
 .../level_zero/src/L0Memory.cpp               | 26 +++++++++++++++----
 .../level_zero/src/L0Plugin.cpp               |  3 +++
 .../level_zero/src/L0Program.cpp              | 10 +++----
 9 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 35f180bdd8c70..340df41e33cef 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -79,13 +79,16 @@ class L0ContextTy {
   L0ContextTy &operator=(const L0ContextTy &&) = delete;
 
   /// Release resources
-  ~L0ContextTy() {
+  ~L0ContextTy() {}
+
+  Error deinit() {
     EventPool.deinit();
     auto Err = HostMemAllocator.deinit();
     if (Err)
-      consumeError(std::move(Err));
+      return Err;
     if (zeContext)
-      CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
+      CALL_ZE_RET_ERROR(zeContextDestroy, zeContext);
+    return Error::success();
   }
 
   auto &getPlugin() const { return Plugin; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 60cf413e93da9..a6dc1d9784bf8 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -306,11 +306,7 @@ class L0DeviceTy final : public GenericDeviceTy {
 
   Error setContext() override { return Plugin::success(); }
   Error initImpl(GenericPluginTy &Plugin) override;
-  Error deinitImpl() override {
-    Programs.clear();
-    return MemAllocator.deinit();
-  }
-
+  Error deinitImpl() override;
   auto getZeDevice() const { return zeDevice; }
 
   const L0ContextTy &getL0Context() const { return l0Context; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index fe51558765ca3..ba25ab3abdabf 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -132,10 +132,7 @@ class L0KernelTy : public GenericKernelTy {
 public:
   /// Create a L0 kernel with a name and an execution mode.
   L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
-  ~L0KernelTy() {
-    if (zeKernel)
-      CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel);
-  }
+  ~L0KernelTy() {}
   L0KernelTy(const L0KernelTy &) = delete;
   L0KernelTy(L0KernelTy &&) = delete;
   L0KernelTy &operator=(const L0KernelTy &) = delete;
@@ -150,6 +147,10 @@ class L0KernelTy : public GenericKernelTy {
                    uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+  Error deinit() {
+    CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
+    return Plugin::success();
+  }
 
   Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
                                   uint64_t DynamicMemSize) const override {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 680ee375cdd56..5fc2cf5784870 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -218,10 +218,11 @@ class MemAllocatorTy {
     MemPoolTy(MemPoolTy &&) = delete;
     MemPoolTy &operator=(const MemPoolTy &) = delete;
     MemPoolTy &operator=(const MemPoolTy &&) = delete;
+    ~MemPoolTy() {}
 
     void printUsage();
     /// Release resources used in the pool.
-    ~MemPoolTy();
+    Error deinit();
 
     /// Allocate the requested size of memory from this pool.
     /// AllocSize is the chunk size internally used for the returned memory.
@@ -313,7 +314,6 @@ class MemAllocatorTy {
   MemAllocatorTy(MemAllocatorTy &&) = delete;
   MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
   MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
-
   ~MemAllocatorTy() {}
 
   /// Release resources and report statistics if requested
@@ -517,10 +517,7 @@ class StagingBufferTy {
   StagingBufferTy &operator=(const StagingBufferTy &) = delete;
   StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
 
-  ~StagingBufferTy() {
-    if (initialized())
-      clear();
-  }
+  ~StagingBufferTy() { }
 
   void clear() {
     ze_result_t Rc;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index 520bfa688a5af..965738dae07e9 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -75,14 +75,15 @@ class L0ProgramTy : public DeviceImageTy {
   L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
               std::unique_ptr<MemoryBuffer> Image)
       : DeviceImageTy(ImageId, Device, std::move(Image)) {}
-
-  ~L0ProgramTy();
+  ~L0ProgramTy() {}
 
   L0ProgramTy(const L0ProgramTy &other) = delete;
   L0ProgramTy(L0ProgramTy &&) = delete;
   L0ProgramTy &operator=(const L0ProgramTy &) = delete;
   L0ProgramTy &operator=(const L0ProgramTy &&) = delete;
 
+  Error deinit();
+
   static L0ProgramTy &makeL0Program(DeviceImageTy &Device) {
     return static_cast<L0ProgramTy &>(Device);
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index d2938882d6e76..74014e287e7a0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -223,6 +223,13 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
   return Plugin::success();
 }
 
+Error L0DeviceTy::deinitImpl() {
+  for (auto &PGM : Programs)
+    if (auto Err = PGM.deinit())
+      return Err;
+  return MemAllocator.deinit();
+}
+
 int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                 bool ReleaseQueue) {
   bool IsAsync = AsyncInfo && asyncEnabled();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 4afe6d1701ce5..d8fcaa90694c9 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -233,7 +233,7 @@ void MemAllocatorTy::MemPoolTy::printUsage() {
 }
 
 /// Release resources used in the pool.
-MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+Error MemAllocatorTy::MemPoolTy::deinit() {
   const int DebugLevel = getDebugLevel();
   if (DebugLevel > 0)
     printUsage();
@@ -241,11 +241,12 @@ MemAllocatorTy::MemPoolTy::~MemPoolTy() {
     for (auto *Block : Bucket) {
       if (DebugLevel > 0)
         Allocator->log(0, Block->Size, AllocKind);
-      CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
-                       reinterpret_cast<void *>(Block->Base));
+      CALL_ZE_RET_ERROR(zeMemFree, Allocator->L0Context->getZeContext(),
+                        reinterpret_cast<void *>(Block->Base));
       delete Block;
     }
   }
+  return Plugin::success();
 }
 
 /// Allocate the requested size of memory from this pool.
@@ -419,8 +420,23 @@ Error MemAllocatorTy::deinit() {
     if (Err)
       return Err;
   }
-  ReductionPool.reset(nullptr);
-  CounterPool.reset(nullptr);
+  for (auto &Pool : Pools) {
+    if (Pool) {
+      if (auto Err = Pool->deinit())
+        return Err;
+      Pool.reset(nullptr);
+    }
+  }
+  if (ReductionPool) {
+    if (auto Err = ReductionPool->deinit())
+      return Err;
+    ReductionPool.reset(nullptr);
+  }
+  if (CounterPool) {
+    if (auto Err = CounterPool->deinit())
+      return Err;
+    CounterPool.reset(nullptr);
+  }
   // Report memory usage if requested
   if (getDebugLevel() > 0) {
     for (auto &Stat : Stats) {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 7cf47a5da7be5..3df7b1e168712 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -149,6 +149,9 @@ Error LevelZeroPluginTy::deinitImpl() {
   ContextTLSTable.clear();
   DeviceTLSTable.clear();
   ThreadTLSTable.clear();
+  for (auto &Context : ContextList)
+    if (auto Err = Context.deinit())
+      return Err;
   ContextList.clear();
   DP("Level0 plugin deinitialized successfully\n");
   return Plugin::success();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 0cd88b4ea97ef..2c3d56b68f7ae 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -47,16 +47,16 @@ inline L0DeviceTy &L0ProgramTy::getL0Device() const {
   return L0DeviceTy::makeL0Device(getDevice());
 }
 
-L0ProgramTy::~L0ProgramTy() {
+Error L0ProgramTy::deinit() {
   for (auto *Kernel : Kernels) {
-    // We need explicit destructor and deallocate calls to release the kernels
-    // created by `GenericDeviceTy::constructKernel()`.
-    Kernel->~L0KernelTy();
+    if (auto Err = Kernel->deinit())
+      return Err;
     getL0Device().getPlugin().free(Kernel);
   }
   for (auto Module : Modules) {
-    CALL_ZE_RET_VOID(zeModuleDestroy, Module);
+    CALL_ZE_RET_ERROR(zeModuleDestroy, Module);
   }
+  return Plugin::success();
 }
 
 void L0ProgramTy::setLibModule() {

>From 91f103397cf76a31b8f1460144c2c4f8c82fc7e0 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 17:29:54 +0200
Subject: [PATCH 51/70] Remove legacy error checking Pt 1

---
 .../level_zero/include/L0Device.h             |  29 +-
 .../level_zero/include/L0Memory.h             |   6 +-
 .../level_zero/include/L0Program.h            |  17 +-
 .../level_zero/include/L0Trace.h              |   5 +
 .../level_zero/src/L0Device.cpp               | 363 ++++++++----------
 .../level_zero/src/L0Memory.cpp               |  10 +-
 .../level_zero/src/L0Program.cpp              |  82 ++--
 7 files changed, 234 insertions(+), 278 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index a6dc1d9784bf8..0406f856de9c3 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -258,12 +258,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   /// The current size of the global device memory pool (managed by us).
   uint64_t HeapSize = 1L << 23L /*8MB=*/;
 
-  int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
-  int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
-                     __tgt_async_info *AsyncInfo);
-  int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
-                       __tgt_async_info *AsyncInfo);
-
   bool shouldSetupDeviceMemoryPool() const override { return false; }
   DeviceArchTy computeArch() const;
 
@@ -336,13 +330,12 @@ class L0DeviceTy final : public GenericDeviceTy {
     return nullptr;
   }
 
-  int32_t buildAllKernels() {
+  Error buildAllKernels() {
     for (auto &PGM : Programs) {
-      int32_t RC = PGM.loadModuleKernels();
-      if (RC != OFFLOAD_SUCCESS)
-        return RC;
+      if (auto Err = PGM.loadModuleKernels())
+        return Err;
     }
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
   // add a new program to the device. Return a reference to the new program
@@ -488,17 +481,17 @@ class L0DeviceTy final : public GenericDeviceTy {
   ze_command_list_handle_t getImmCopyCmdList();
 
   /// Enqueue copy command
-  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
-                         __tgt_async_info *AsyncInfo = nullptr,
-                         bool UseCopyEngine = true);
+  Error enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                       __tgt_async_info *AsyncInfo = nullptr,
+                       bool UseCopyEngine = true);
 
   /// Enqueue asynchronous copy command
-  int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
-                              __tgt_async_info *AsyncInfo, bool CopyTo = true);
+  Error enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                            __tgt_async_info *AsyncInfo, bool CopyTo = true);
 
   /// Enqueue fill command
-  int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
-                         size_t Size);
+  Error enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
+                       size_t Size);
 
   /// Driver related functions
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 5fc2cf5784870..444ead6ea4a76 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -388,10 +388,10 @@ class MemAllocatorTy {
   }
 
   /// Perform copy operation
-  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+  Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
 
   /// Perform memory fill operation
-  int32_t enqueueMemSet(void *Dst, int8_t Value, size_t Size);
+  Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
 
 }; /// MemAllocatorTy
 
@@ -517,7 +517,7 @@ class StagingBufferTy {
   StagingBufferTy &operator=(const StagingBufferTy &) = delete;
   StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
 
-  ~StagingBufferTy() { }
+  ~StagingBufferTy() {}
 
   void clear() {
     ze_result_t Rc;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index 965738dae07e9..15b9255f46279 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -56,9 +56,9 @@ class L0ProgramTy : public DeviceImageTy {
   bool IsLibModule = false;
 
   /// Build a single module with the given image, build option, and format.
-  int32_t addModule(const size_t Size, const uint8_t *Image,
-                    const std::string_view BuildOption,
-                    ze_module_format_t Format);
+  Error addModule(const size_t Size, const uint8_t *Image,
+                  const std::string_view BuildOption,
+                  ze_module_format_t Format);
   /// Read file and return the size of the binary if successful.
   size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
   void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
@@ -89,22 +89,21 @@ class L0ProgramTy : public DeviceImageTy {
   }
 
   /// Build modules from the target image description
-  int32_t buildModules(const std::string_view BuildOptions);
+  Error buildModules(const std::string_view BuildOptions);
 
   /// Link modules stored in \p Modules.
-  int32_t linkModules();
+  Error linkModules();
 
   /// Loads the kernels names from all modules
-  int32_t loadModuleKernels();
+  Error loadModuleKernels();
 
   /// Read data from the location in the device image which corresponds to the
   /// specified global variable name.
-  int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
+  Error readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
 
   /// Write data to the location in the device image which corresponds to the
   /// specified global variable name.
-  int32_t writeGlobalVariable(const char *Name, size_t Size,
-                              const void *HostPtr);
+  Error writeGlobalVariable(const char *Name, size_t Size, const void *HostPtr);
 
   /// Looks up an OpenMP declare target global variable with the given
   /// \p Name and \p Size in the device environment for the current device.
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index 0faa76171cbc9..22170b723a31b 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -70,6 +70,11 @@
   CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
 #define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...)                                     \
   CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_ERROR_MTX(Fn, Mtx, ...)                                   \
+  CALL_ZE_RET_MTX(                                                            \
+    Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s",          \
+    STR(Fn), rc, getZeErrorName(rc)), Fn, Mtx, __VA_ARGS__)
+
 
 /// For thread-safe functions
 #define CALL_ZE_RET(Ret, Fn, ...)                                              \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 74014e287e7a0..07bce14442edf 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -230,25 +230,67 @@ Error L0DeviceTy::deinitImpl() {
   return MemAllocator.deinit();
 }
 
-int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
-                                bool ReleaseQueue) {
-  bool IsAsync = AsyncInfo && asyncEnabled();
+Expected<DeviceImageTy *>
+L0DeviceTy::loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
+                           int32_t ImageId) {
+  auto *PGM = getProgramFromImage(TgtImage->getMemBufferRef());
+  if (PGM) {
+    // Program already exists
+    return PGM;
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
+       DPxPTR(TgtImage->getBufferStart()));
+
+  const auto &Options = getPlugin().getOptions();
+  std::string CompilationOptions(Options.CompilationOptions);
+  CompilationOptions += " " + Options.UserCompilationOptions;
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
+
+  CompilationOptions += " ";
+  CompilationOptions += Options.InternalCompilationOptions;
+  auto &Program = addProgram(ImageId, std::move(TgtImage));
+
+  if (auto Err = Program.buildModules(CompilationOptions))
+    return Err;
+
+  if (auto Err = Program.linkModules())
+    return Err;
+
+  if (auto Err = Program.loadModuleKernels())
+    return Err;
+
+  return &Program;
+}
+
+Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
+  // Unloading is currently a no-op.
+  // TODO: properly call L0Program unload
+  return Plugin::success();
+}
+
+Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
+                                  bool ReleaseQueue) {
+  bool IsAsync = asyncEnabled();
   if (!IsAsync)
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
 
   auto &Plugin = getPlugin();
 
-  AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue;
+  AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo.Queue;
 
   if (!AsyncQueue->WaitEvents.empty()) {
     const auto &WaitEvents = AsyncQueue->WaitEvents;
     if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
       // Only need to wait for the last event
-      CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
+      CALL_ZE_RET_ERROR(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
       // Synchronize on kernel event to support printf()
       auto KE = AsyncQueue->KernelEvent;
       if (KE && KE != WaitEvents.back()) {
-        CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX);
+        CALL_ZE_RET_ERROR(zeEventHostSynchronize, KE, UINT64_MAX);
       }
       for (auto &Event : WaitEvents) {
         releaseEvent(Event);
@@ -262,7 +304,7 @@ int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
       bool WaitDone = false;
       for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
         if (!WaitDone) {
-          CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX);
+          CALL_ZE_RET_ERROR(zeEventHostSynchronize, *Itr, UINT64_MAX);
           if (*Itr == AsyncQueue->KernelEvent)
             WaitDone = true;
         }
@@ -284,17 +326,73 @@ int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
   if (ReleaseQueue) {
     Plugin.releaseAsyncQueue(AsyncQueue);
     getStagingBuffer().reset();
-    AsyncInfo->Queue = nullptr;
+    AsyncInfo.Queue = nullptr;
   }
-  return OFFLOAD_SUCCESS;
+
+  return Plugin::success();
+}
+
+Expected<bool>
+L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper);
+  const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+  if (!IsAsync)
+    return false;
+
+  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+  if (AsyncQueue->WaitEvents.empty())
+    return false;
+
+  return true;
+}
+
+Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
+  const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+  if (!IsAsync)
+    return Plugin::success();
+
+  auto &Plugin = getPlugin();
+  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+  if (!AsyncQueue->WaitEvents.empty())
+    return Plugin::success();
+
+  // Commit delayed USM2M copies
+  for (auto &USM2M : AsyncQueue->USM2MList) {
+    std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+                std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+  }
+  // Commit delayed H2M copies
+  for (auto &H2M : AsyncQueue->H2MList) {
+    std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+                static_cast<char *>(std::get<1>(H2M)));
+  }
+  Plugin.releaseAsyncQueue(AsyncQueue);
+  getStagingBuffer().reset();
+  AsyncInfo.Queue = nullptr;
+
+  return Plugin::success();
 }
 
-int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
-                               __tgt_async_info *AsyncInfo) {
+Expected<void *> L0DeviceTy::allocate(size_t Size, void *HstPtr,
+                                      TargetAllocTy Kind) {
+  return dataAlloc(Size, /*Align=*/0, Kind,
+                   /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
+                   /*DevMalloc=*/false);
+}
+
+Error L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+  return dataDelete(TgtPtr);
+}
+
+Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) {
   if (Size == 0)
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
 
   auto &Plugin = getPlugin();
+  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
 
   const auto DeviceId = getDeviceId();
   bool IsAsync = AsyncInfo && asyncEnabled();
@@ -317,28 +415,30 @@ int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
       std::copy_n(static_cast<const char *>(HstPtr), Size,
                   static_cast<char *>(const_cast<void *>(SrcPtr)));
     }
-    int32_t RC;
-    if (IsAsync)
-      RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo);
-    else
-      RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo);
-    if (RC != OFFLOAD_SUCCESS)
-      return RC;
+    if (IsAsync) {
+      if (auto Err = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo))
+        return Err;
+    } else {
+      if (auto Err = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo))
+        return Err;
+    }
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
        IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr),
        DPxPTR(TgtPtr));
-
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
-int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
-                                 __tgt_async_info *AsyncInfo) {
+Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
+                                   int64_t Size,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
   if (Size == 0)
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
 
   auto &Plugin = getPlugin();
+  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
+
   const auto DeviceId = getDeviceId();
   bool IsAsync = AsyncInfo && asyncEnabled();
   if (IsAsync && !AsyncInfo->Queue) {
@@ -372,14 +472,14 @@ int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
         getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
       DstPtr = getStagingBuffer().get(IsAsync);
     }
-    int32_t RC;
-    if (IsAsync)
-      RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
-                               /* CopyTo */ false);
-    else
-      RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo);
-    if (RC != OFFLOAD_SUCCESS)
-      return RC;
+    if (IsAsync) {
+      if (auto Err = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
+                                         /* CopyTo */ false))
+        return Err;
+    } else {
+      if (auto Err = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo))
+        return Err;
+    }
     if (DstPtr != HstPtr) {
       if (IsAsync) {
         // Store delayed H2M data copies
@@ -395,134 +495,9 @@ int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
        "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
        IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr),
        DPxPTR(HstPtr));
-
-  return OFFLOAD_SUCCESS;
-}
-
-Expected<DeviceImageTy *>
-L0DeviceTy::loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
-                           int32_t ImageId) {
-  auto *PGM = getProgramFromImage(TgtImage->getMemBufferRef());
-  if (PGM) {
-    // Program already exists
-    return PGM;
-  }
-
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
-       "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
-       DPxPTR(TgtImage->getBufferStart()));
-
-  const auto &Options = getPlugin().getOptions();
-  std::string CompilationOptions(Options.CompilationOptions);
-  CompilationOptions += " " + Options.UserCompilationOptions;
-
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
-       "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
-
-  CompilationOptions += " ";
-  CompilationOptions += Options.InternalCompilationOptions;
-  auto &Program = addProgram(ImageId, std::move(TgtImage));
-
-  int32_t RC = Program.buildModules(CompilationOptions);
-  if (RC != OFFLOAD_SUCCESS)
-    return Plugin::check(RC, "Error in buildModules %d", RC);
-
-  RC = Program.linkModules();
-  if (RC != OFFLOAD_SUCCESS)
-    return Plugin::check(RC, "Error in linkModules %d", RC);
-
-  RC = Program.loadModuleKernels();
-  if (RC != OFFLOAD_SUCCESS)
-    return Plugin::check(RC, "Error in buildKernels %d", RC);
-
-  return &Program;
-}
-
-Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
-  // Ignoring for now
-  // TODO: call properly L0Program unload
   return Plugin::success();
 }
 
-Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
-                                  bool ReleaseQueue) {
-  if (!ReleaseQueue) {
-    return Plugin::error(ErrorCode::UNIMPLEMENTED,
-                         "Support for ReleaseQueue=false in %s"
-                         " not implemented yet\n",
-                         __func__);
-  }
-  int32_t RC = synchronize(&AsyncInfo, ReleaseQueue);
-  return Plugin::check(RC, "Error in synchronizeImpl %d", RC);
-}
-
-Expected<bool>
-L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
-  auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper);
-  const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
-  if (!IsAsync)
-    return false;
-
-  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
-
-  if (AsyncQueue->WaitEvents.empty())
-    return false;
-
-  return true;
-}
-
-Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
-  const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
-  if (!IsAsync)
-    return Plugin::success();
-
-  auto &Plugin = getPlugin();
-  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
-
-  if (!AsyncQueue->WaitEvents.empty())
-    return Plugin::success();
-
-  // Commit delayed USM2M copies
-  for (auto &USM2M : AsyncQueue->USM2MList) {
-    std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
-                std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
-  }
-  // Commit delayed H2M copies
-  for (auto &H2M : AsyncQueue->H2MList) {
-    std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
-                static_cast<char *>(std::get<1>(H2M)));
-  }
-  Plugin.releaseAsyncQueue(AsyncQueue);
-  getStagingBuffer().reset();
-  AsyncInfo.Queue = nullptr;
-
-  return Plugin::success();
-}
-
-Expected<void *> L0DeviceTy::allocate(size_t Size, void *HstPtr,
-                                      TargetAllocTy Kind) {
-  return dataAlloc(Size, /*Align=*/0, Kind,
-                   /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
-                   /*DevMalloc=*/false);
-}
-
-Error L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
-  return dataDelete(TgtPtr);
-}
-
-Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
-                                 AsyncInfoWrapperTy &AsyncInfoWrapper) {
-  int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
-  return Plugin::check(RC, "Error in dataSubmitImpl %d", RC);
-}
-
-Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
-                                   int64_t Size,
-                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
-  int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
-  return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC);
-}
-
 Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
                                    void *DstPtr, int64_t Size,
                                    AsyncInfoWrapperTy &AsyncInfoWrapper) {
@@ -695,17 +670,17 @@ Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
   return Plugin::success();
 }
 
-int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
-                                   __tgt_async_info *AsyncInfo,
-                                   bool UseCopyEngine) {
+Error L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                                 __tgt_async_info *AsyncInfo,
+                                 bool UseCopyEngine) {
   ze_command_list_handle_t CmdList = nullptr;
   ze_command_queue_handle_t CmdQueue = nullptr;
 
   if (useImmForCopy()) {
     CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
-    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
-                     nullptr, 0, nullptr);
-    CALL_ZE_RET_FAIL(zeCommandListHostSynchronize, CmdList, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                      nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListHostSynchronize, CmdList, UINT64_MAX);
   } else {
     if (UseCopyEngine) {
       CmdList = getCopyCmdList();
@@ -715,22 +690,22 @@ int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
       CmdQueue = getCmdQueue();
     }
 
-    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
-                     nullptr, 0, nullptr);
-    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
-    CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
-                         CmdQueue, 1, &CmdList, nullptr);
-    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
-    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                      nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+                          CmdQueue, 1, &CmdList, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
   }
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 /// Enqueue non-blocking memory copy. This function is invoked only when IMM is
 /// fully enabled and async mode is requested.
-int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
-                                        __tgt_async_info *AsyncInfo,
-                                        bool CopyTo) {
+Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                                      __tgt_async_info *AsyncInfo,
+                                      bool CopyTo) {
   const bool Ordered =
       (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
   ze_event_handle_t SignalEvent = getEvent();
@@ -748,44 +723,40 @@ int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
       NumWaitEvents = 0;
   }
   auto CmdList = getImmCopyCmdList();
-  CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
-                   SignalEvent, NumWaitEvents, WaitEvents);
+  CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                    SignalEvent, NumWaitEvents, WaitEvents);
   AsyncQueue->WaitEvents.push_back(SignalEvent);
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 /// Enqueue memory fill
-int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
-                                   size_t PatternSize, size_t Size) {
+Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
+                                 size_t PatternSize, size_t Size) {
   if (useImmForCopy()) {
     const auto CmdList = getImmCopyCmdList();
     auto Event = getEvent();
-    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
-                     PatternSize, Size, Event, 0, nullptr);
-    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                      PatternSize, Size, Event, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
   } else {
     auto CmdList = getCopyCmdList();
     const auto CmdQueue = getCopyCmdQueue();
-    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
-                     PatternSize, Size, nullptr, 0, nullptr);
-    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
-    CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
-                     nullptr);
-    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
-    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                      PatternSize, Size, nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                      nullptr);
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
   }
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
                                int64_t PatternSize, int64_t Size,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) {
   // TODO: support async version
-  // TODO: convert enqueueMemFill to return Error code
-  if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS)
-    return Plugin::success();
-
-  return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
+  return enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size);
 }
 
 Expected<void *> L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index d8fcaa90694c9..21613c62f5964 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -280,8 +280,9 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
     void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
 
     if (ZeroInit) {
-      auto RC = Allocator->enqueueMemSet(Base, 0, BlockSize);
-      if (RC != OFFLOAD_SUCCESS) {
+      auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
+      if (Err) {
+        consumeError(std::move(Err));
         DP("Failed to zero-initialize pool memory\n");
         return nullptr;
       }
@@ -572,12 +573,11 @@ Error MemAllocatorTy::deallocLocked(void *Ptr) {
   return Plugin::success();
 }
 
-int32_t MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) {
+Error MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) {
   return Device->enqueueMemFill(Dst, &Value, sizeof(int8_t), Size);
 }
 
-int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
-                                       size_t Size) {
+Error MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size) {
   return Device->enqueueMemCopy(Dst, Src, Size);
 }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 2c3d56b68f7ae..35748d9a627b6 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -87,9 +87,9 @@ void L0ProgramTy::setLibModule() {
 #endif // _WIN32
 }
 
-int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
-                               const std::string_view CommonBuildOptions,
-                               ze_module_format_t Format) {
+Error L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
+                             const std::string_view CommonBuildOptions,
+                             ze_module_format_t Format) {
   const ze_module_constants_t SpecConstants =
       LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
   auto &l0Device = getL0Device();
@@ -105,27 +105,21 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
   ModuleDesc.format = Format;
   ze_module_handle_t Module = nullptr;
   ze_module_build_log_handle_t BuildLog = nullptr;
-  ze_result_t RC;
 
   // Build a single module from a single image
   ModuleDesc.inputSize = Size;
   ModuleDesc.pInputModule = Image;
   ModuleDesc.pBuildFlags = BuildOptions.c_str();
   ModuleDesc.pConstants = &SpecConstants;
-  CALL_ZE_RC(RC, zeModuleCreate, l0Device.getZeContext(),
-             l0Device.getZeDevice(), &ModuleDesc, &Module, &BuildLog);
-
-  const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
-
-  if (BuildFailed)
-    return OFFLOAD_FAIL;
+  CALL_ZE_RET_ERROR(zeModuleCreate, l0Device.getZeContext(),
+                    l0Device.getZeDevice(), &ModuleDesc, &Module, &BuildLog);
 
   // Check if module link is required. We do not need this check for
   // library module
   if (!RequiresModuleLink && !IsLibModule) {
     ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
                                          nullptr, 0};
-    CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+    CALL_ZE_RET_ERROR(zeModuleGetProperties, Module, &Properties);
     RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
   }
   // For now, assume the first module contains libraries, globals.
@@ -133,28 +127,26 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
     GlobalModule = Module;
   Modules.push_back(Module);
   l0Device.addGlobalModule(Module);
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
-int32_t L0ProgramTy::linkModules() {
+Error L0ProgramTy::linkModules() {
   auto &l0Device = getL0Device();
   if (!RequiresModuleLink) {
     DP("Module link is not required\n");
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
   if (Modules.empty()) {
-    DP("Invalid number of modules when linking modules\n");
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::UNKNOWN,
+                         "Invalid number of modules when linking modules");
   }
 
-  ze_result_t RC;
   ze_module_build_log_handle_t LinkLog = nullptr;
-  CALL_ZE_RC(RC, zeModuleDynamicLink,
-             static_cast<uint32_t>(l0Device.getNumGlobalModules()),
-             l0Device.getGlobalModulesArray(), &LinkLog);
-  const bool LinkFailed = (RC != ZE_RESULT_SUCCESS);
-  return LinkFailed ? OFFLOAD_FAIL : OFFLOAD_SUCCESS;
+  CALL_ZE_RET_ERROR(zeModuleDynamicLink,
+                    static_cast<uint32_t>(l0Device.getNumGlobalModules()),
+                    l0Device.getGlobalModulesArray(), &LinkLog);
+  return Plugin::success();
 }
 
 size_t L0ProgramTy::readFile(const char *FileName,
@@ -266,7 +258,7 @@ bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
   return Res;
 }
 
-int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
+Error L0ProgramTy::buildModules(const std::string_view BuildOptions) {
   auto &l0Device = getL0Device();
   auto Image = getMemoryBuffer();
   if (identify_magic(Image.getBuffer()) == file_magic::spirv_object) {
@@ -279,7 +271,7 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
   uint64_t MajorVer, MinorVer;
   if (!isValidOneOmpImage(Image.getBuffer(), MajorVer, MinorVer)) {
     DP("Warning: image is not a valid oneAPI OpenMP image.\n");
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::UNKNOWN, "Invalid oneAPI OpenMP image");
   }
 
   setLibModule();
@@ -476,21 +468,17 @@ int32_t L0ProgramTy::buildModules(const std::string_view BuildOptions) {
           reinterpret_cast<const unsigned char *>(It->second.PartBegin[I]);
       size_t ImgSize = It->second.PartSize[I];
 
-      auto RC = addModule(ImgSize, ImgBegin, Options, ModuleFormat);
-
-      if (RC != OFFLOAD_SUCCESS) {
-        DP("Error: failed to create program from %s "
-           "(%" PRIu64 "-%zu).\n",
-           IsBinary ? "Binary" : "SPIR-V", Idx, I);
-        return OFFLOAD_FAIL;
-      }
+      DP("Creating module from %s image part #%" PRIu64 "-%zu.\n",
+         IsBinary ? "Binary" : "SPIR-V", Idx, I);
+      if (auto Err = addModule(ImgSize, ImgBegin, Options, ModuleFormat))
+        return Err;
     }
     DP("Created module from image #%" PRIu64 ".\n", Idx);
 
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
-  return OFFLOAD_FAIL;
+  return Plugin::error(ErrorCode::UNKNOWN, "Failed to create program modules.");
 }
 
 void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
@@ -514,52 +502,52 @@ void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
   return nullptr;
 }
 
-int32_t L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
-                                        void *HostPtr) {
+Error L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
+                                      void *HostPtr) {
   size_t SizeDummy = 0;
   void *DevicePtr = nullptr;
   ze_result_t RC;
   CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
           &DevicePtr);
   if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
-    DP("Warning: cannot read from device global variable %s\n", Name);
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Cannot read from device global variable %s", Name);
   }
   return getL0Device().enqueueMemCopy(HostPtr, DevicePtr, Size);
 }
 
-int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
-                                         const void *HostPtr) {
+Error L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
+                                       const void *HostPtr) {
   size_t SizeDummy = 0;
   void *DevicePtr = nullptr;
   ze_result_t RC;
   CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
           &DevicePtr);
   if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
-    DP("Warning: cannot write to device global variable %s\n", Name);
-    return OFFLOAD_FAIL;
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Cannot write to device global variable %s", Name);
   }
   return getL0Device().enqueueMemCopy(DevicePtr, HostPtr, Size);
 }
 
-int32_t L0ProgramTy::loadModuleKernels() {
+Error L0ProgramTy::loadModuleKernels() {
   // We need to build kernels here before filling the offload entries since we
   // don't know which module contains a specific kernel with a name.
   for (auto Module : Modules) {
     uint32_t Count = 0;
-    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);
+    CALL_ZE_RET_ERROR(zeModuleGetKernelNames, Module, &Count, nullptr);
     if (Count == 0)
       continue;
 
     llvm::SmallVector<const char *> Names(Count);
-    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, Names.data());
+    CALL_ZE_RET_ERROR(zeModuleGetKernelNames, Module, &Count, Names.data());
 
     for (auto *Name : Names) {
       KernelsToModuleMap.emplace(Name, Module);
     }
   }
 
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 } // namespace llvm::omp::target::plugin

>From 1221006caa5b0272927ea8be77de864d0f996a87 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 18:13:18 +0200
Subject: [PATCH 52/70] Remove legacy error checking Pt 2

---
 .../level_zero/include/L0Device.h             |  2 +-
 .../level_zero/include/L0Kernel.h             | 24 ++---
 .../level_zero/include/L0Memory.h             |  3 +-
 .../level_zero/include/L0Program.h            |  2 +-
 .../level_zero/src/L0Device.cpp               | 21 ++---
 .../level_zero/src/L0Kernel.cpp               | 94 +++++++++----------
 .../level_zero/src/L0Memory.cpp               | 37 +++++---
 .../level_zero/src/L0Program.cpp              | 21 ++---
 8 files changed, 103 insertions(+), 101 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 0406f856de9c3..3519c1fb76d52 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -549,7 +549,7 @@ class L0DeviceTy final : public GenericDeviceTy {
     return getDeviceMemAllocator();
   }
 
-  int32_t makeMemoryResident(void *Mem, size_t Size);
+  Error makeMemoryResident(void *Mem, size_t Size);
 
   // Generic device interface implementation
   Expected<DeviceImageTy *>
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index ba25ab3abdabf..ceabe88c618ef 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -110,9 +110,9 @@ class L0KernelTy : public GenericKernelTy {
   KernelPropertiesTy Properties;
   auto &getProperties() { return Properties; }
 
-  int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
-                              KernelLaunchParamsTy LaunchParams,
-                              __tgt_async_info *AsyncInfo) const;
+  Error runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
+                            KernelLaunchParamsTy LaunchParams,
+                            __tgt_async_info *AsyncInfo) const;
 
   void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
                                   uint32_t ThreadLimit,
@@ -122,10 +122,12 @@ class L0KernelTy : public GenericKernelTy {
                                   bool HalfNumThreads,
                                   bool IsTeamsNDRange) const;
 
-  int32_t decideLoopKernelGroupArguments(
-      L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
-      uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
-      bool &AllowCooperative) const;
+  Error decideLoopKernelGroupArguments(L0DeviceTy &Device, uint32_t ThreadLimit,
+                                       TgtNDRangeDescTy *LoopLevels,
+                                       uint32_t *GroupSizes,
+                                       ze_group_count_t &GroupCounts,
+                                       bool HalfNumThreads,
+                                       bool &AllowCooperative) const;
 
   Error buildKernel(L0ProgramTy &Program);
 
@@ -160,10 +162,10 @@ class L0KernelTy : public GenericKernelTy {
 
   ze_kernel_handle_t getZeKernel() const { return zeKernel; }
 
-  int32_t getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
-                         int32_t ThreadLimit, uint32_t *GroupSizes,
-                         ze_group_count_t &GroupCounts, void *LoopDesc,
-                         bool &AllowCooperative) const;
+  Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
+                       int32_t ThreadLimit, uint32_t *GroupSizes,
+                       ze_group_count_t &GroupCounts, void *LoopDesc,
+                       bool &AllowCooperative) const;
 };
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 444ead6ea4a76..a457247c36256 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -329,7 +329,8 @@ class MemAllocatorTy {
   /// Allocate memory from L0 GPU RT. We use over-allocation workaround
   /// to support target pointer with offset, and positive "ActiveSize" is
   /// specified in such cases for correct debug logging.
-  void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
+  Expected<void *> allocL0(size_t Size, size_t Align, int32_t Kind,
+                           size_t ActiveSize = 0);
 
   /// Allocate memory with the specified information from a memory pool
   Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index 15b9255f46279..ace4e20ccfc36 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -109,7 +109,7 @@ class L0ProgramTy : public DeviceImageTy {
   /// \p Name and \p Size in the device environment for the current device.
   /// The lookup is first done via the device offload table. If it fails,
   /// then the lookup falls back to non-OpenMP specific lookup on the device.
-  void *getOffloadVarDeviceAddr(const char *Name) const;
+  Expected<void *> getOffloadVarDeviceAddr(const char *Name) const;
 
   /// Returns the handle of a module that contains a given Kernel name
   ze_module_handle_t findModuleFromKernelName(const char *KernelName) const {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 07bce14442edf..28c0eaa12f92d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -255,13 +255,13 @@ L0DeviceTy::loadBinaryImpl(std::unique_ptr<MemoryBuffer> &&TgtImage,
   auto &Program = addProgram(ImageId, std::move(TgtImage));
 
   if (auto Err = Program.buildModules(CompilationOptions))
-    return Err;
+    return std::move(Err);
 
   if (auto Err = Program.linkModules())
-    return Err;
+    return std::move(Err);
 
   if (auto Err = Program.loadModuleKernels())
-    return Err;
+    return std::move(Err);
 
   return &Program;
 }
@@ -787,17 +787,10 @@ Error L0DeviceTy::dataDelete(void *Ptr) {
   return Allocator.dealloc(Ptr);
 }
 
-int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
-  ze_result_t RC;
-  CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem,
-          Size);
-  if (RC != ZE_RESULT_SUCCESS) {
-    DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD
-       ".\n",
-       DPxPTR(Mem), DPxPTR(getZeDevice()));
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
+Error L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
+  CALL_ZE_RET_ERROR(zeContextMakeMemoryResident, getZeContext(), getZeDevice(),
+                    Mem, Size);
+  return Plugin::success();
 }
 
 // Command queues related functions
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 53642eba20475..f77a9d3f4b0e4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -24,12 +24,8 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
 
   auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
-  int32_t RC = runTargetTeamRegion(l0Device, KernelArgs,
-                                   std::move(LaunchParams), AsyncInfoWrapper);
-  if (RC == OFFLOAD_SUCCESS)
-    return Plugin::success();
-  return Plugin::error(error::ErrorCode::UNKNOWN,
-                       "Error in launch Kernel %s: %d", getName(), RC);
+  return runTargetTeamRegion(l0Device, KernelArgs, std::move(LaunchParams),
+                             AsyncInfoWrapper);
 }
 
 Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
@@ -222,7 +218,7 @@ static uint64_t computeThreadsNeeded(const llvm::ArrayRef<size_t> TripCounts,
   return GroupCount[0] * ThreadsPerWG;
 }
 
-int32_t L0KernelTy::decideLoopKernelGroupArguments(
+Error L0KernelTy::decideLoopKernelGroupArguments(
     L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
     uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
     bool &AllowCooperative) const {
@@ -282,7 +278,12 @@ int32_t L0KernelTy::decideLoopKernelGroupArguments(
         INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
              "Invalid number of teams %zu due to large loop trip count\n",
              DistributeTripCount);
-        return OFFLOAD_FAIL;
+        // This path previously returned OFFLOAD_FAIL; keep reporting a failure
+        // because we return here before GroupCounts is written.
+        return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                             "Invalid number of teams %zu due to large loop "
+                             "trip count\n",
+                             DistributeTripCount);
       }
       GRPCounts[DistributeDim] = DistributeTripCount;
     }
@@ -290,7 +286,7 @@ int32_t L0KernelTy::decideLoopKernelGroupArguments(
     GroupCounts.groupCountX = GRPCounts[0];
     GroupCounts.groupCountY = GRPCounts[1];
     GroupCounts.groupCountZ = GRPCounts[2];
-    return OFFLOAD_SUCCESS;
+    return Plugin::success();
   }
 
   if (!MaxGroupSizeForced) {
@@ -355,9 +351,10 @@ int32_t L0KernelTy::decideLoopKernelGroupArguments(
       GRPSizes[I] = Trip;
     size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
     if (Count > UINT32_MAX) {
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-           "Invalid number of teams %zu due to large loop trip count\n", Count);
-      return OFFLOAD_FAIL;
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "Invalid number of teams %zu due to large loop "
+                           "trip count\n",
+                           Count);
     }
     GRPCounts[I] = (uint32_t)Count;
   }
@@ -367,14 +364,13 @@ int32_t L0KernelTy::decideLoopKernelGroupArguments(
   GroupCounts.groupCountZ = GRPCounts[2];
   std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
 
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
-int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
-                                   int32_t ThreadLimit, uint32_t *GroupSizes,
-                                   ze_group_count_t &GroupCounts,
-                                   void *LoopDesc,
-                                   bool &AllowCooperative) const {
+Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
+                                 int32_t ThreadLimit, uint32_t *GroupSizes,
+                                 ze_group_count_t &GroupCounts, void *LoopDesc,
+                                 bool &AllowCooperative) const {
 
   const auto DeviceId = Device.getDeviceId();
   const auto &KernelPR = getProperties();
@@ -443,13 +439,13 @@ int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
     AllowCooperative = false;
   }
 
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
-int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
-                                        KernelArgsTy &KernelArgs,
-                                        KernelLaunchParamsTy LaunchParams,
-                                        __tgt_async_info *AsyncInfo) const {
+Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+                                      KernelArgsTy &KernelArgs,
+                                      KernelLaunchParamsTy LaunchParams,
+                                      __tgt_async_info *AsyncInfo) const {
   // Libomptarget can pass negative NumTeams and ThreadLimit now after
   // introducing __tgt_target_kernel. This happens only when we have valid
   // LoopDesc and the region is not a teams region.
@@ -500,13 +496,9 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
       GroupSizes, GroupCounts, AllowCooperative);
 
   if (!GroupParamsReused) {
-    auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
-                             GroupCounts, LoopDesc, AllowCooperative);
-
-    if (RC != OFFLOAD_SUCCESS) {
-      return RC;
-    }
-
+    if (auto Err = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+                                  GroupCounts, LoopDesc, AllowCooperative))
+      return Err;
     KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
                               NumTeams, ThreadLimit, GroupSizes, GroupCounts,
                               AllowCooperative);
@@ -522,8 +514,8 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   for (int32_t I = 0; I < NumArgs; I++) {
     {
       void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
-      CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
-                       Arg == nullptr ? nullptr : &Arg);
+      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+                        Arg == nullptr ? nullptr : &Arg);
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
            "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
            ") was set successfully for device %s.\n",
@@ -540,14 +532,14 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   if (PrevFlags != Flags) {
     // Combine with common access flags
     const auto FinalFlags = Device.getIndirectFlags() | Flags;
-    CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+    CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
     DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
     PrevFlags = Flags;
   }
 
   if (!GroupParamsReused) {
-    CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
-                     GroupSizes[1], GroupSizes[2]);
+    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+                      GroupSizes[1], GroupSizes[2]);
   }
 
   ze_command_list_handle_t CmdList = nullptr;
@@ -580,36 +572,36 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Kernel depends on %zu data copying events.\n", NumWaitEvents);
     if (AllowCooperative)
-      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                       zeKernel, &GroupCounts, Event, NumWaitEvents,
-                       WaitEvents);
+      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                        zeKernel, &GroupCounts, Event, NumWaitEvents,
+                        WaitEvents);
     else
-      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                       &GroupCounts, Event, NumWaitEvents, WaitEvents);
+      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                        &GroupCounts, Event, NumWaitEvents, WaitEvents);
     KernelLock.unlock();
     if (IsAsync) {
       AsyncQueue->WaitEvents.push_back(Event);
       AsyncQueue->KernelEvent = Event;
     } else {
-      CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+      CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
       Device.releaseEvent(Event);
     }
   } else {
     ze_event_handle_t Event = nullptr;
     if (AllowCooperative)
-      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                       zeKernel, &GroupCounts, Event, 0, nullptr);
+      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                        zeKernel, &GroupCounts, Event, 0, nullptr);
     else
-      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                       &GroupCounts, Event, 0, nullptr);
+      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                        &GroupCounts, Event, 0, nullptr);
     KernelLock.unlock();
-    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
-    CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
-                         CmdQueue, 1, &CmdList, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+                          CmdQueue, 1, &CmdList, nullptr);
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
-    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
-    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
     if (Event) {
       Device.releaseEvent(Event);
     }
@@ -619,7 +611,7 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
        "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
        IdStr);
 
-  return OFFLOAD_SUCCESS;
+  return Plugin::success();
 }
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 21613c62f5964..7727df4725a41 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -83,7 +83,12 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
 
   // Check page size used for this allocation kind to decide minimum
   // allocation size when allocating from L0.
-  void *Mem = Allocator->allocL0(8, 0, AllocKind);
+  auto MemOrErr = Allocator->allocL0(8, 0, AllocKind);
+  // Include the underlying error text so the abort message explains the cause.
+  if (!MemOrErr)
+    FATAL_MESSAGE(0, "Failed to allocate memory pool: %s\n",
+                  toString(MemOrErr.takeError()).c_str());
+  void *Mem = *MemOrErr;
   ze_memory_allocation_properties_t AP{
       ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
       ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
@@ -277,7 +280,14 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
     // Bucket is empty or all blocks in the bucket are full
     const auto ChunkSize = BucketParams[BucketId].first;
     const auto BlockSize = BucketParams[BucketId].second;
-    void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+    auto BaseOrErr = Allocator->allocL0(BlockSize, 0, AllocKind);
+    if (!BaseOrErr) {
+      consumeError(BaseOrErr.takeError());
+      DP("Failed to allocate new block for %s pool\n",
+         AllocKindToStr(AllocKind));
+      return nullptr;
+    }
+    void *Base = *BaseOrErr;
 
     if (ZeroInit) {
       auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
@@ -515,7 +525,10 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
     }
   }
 
-  AllocBase = allocL0(AllocSize, Align, Kind, Size);
+  auto AllocBaseOrErr = allocL0(AllocSize, Align, Kind, Size);
+  if (!AllocBaseOrErr)
+    return AllocBaseOrErr.takeError();
+  AllocBase = *AllocBaseOrErr;
   if (AllocBase) {
     Mem = (void *)((uintptr_t)AllocBase + Offset);
     AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
@@ -581,8 +594,8 @@ Error MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size) {
   return Device->enqueueMemCopy(Dst, Src, Size);
 }
 
-void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
-                              size_t ActiveSize) {
+Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
+                                         int32_t Kind, size_t ActiveSize) {
   void *Mem = nullptr;
   ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
                                         nullptr, 0, 0};
@@ -604,17 +617,17 @@ void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
   switch (Kind) {
   case TARGET_ALLOC_DEVICE:
     makeResident = true;
-    CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
-                     zeDevice, &Mem);
+    CALL_ZE_RET_ERROR(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
+                      zeDevice, &Mem);
     DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
     break;
   case TARGET_ALLOC_HOST:
-    CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
+    CALL_ZE_RET_ERROR(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
     DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
     break;
   case TARGET_ALLOC_SHARED:
-    CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
-                     Align, zeDevice, &Mem);
+    CALL_ZE_RET_ERROR(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
+                      Align, zeDevice, &Mem);
     DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
     break;
   default:
@@ -626,8 +639,10 @@ void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
   if (makeResident) {
     assert(Device &&
            "Device is not set for memory allocation. Is this a Device Pool?");
-    if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS)
-      Mem = nullptr;
+    // NOTE(review): Mem is still allocated on this error path and is not
+    // freed here; confirm whether it should be released before returning.
+    if (auto Err = Device->makeMemoryResident(Mem, Size))
+      return std::move(Err);
   }
   return Mem;
 }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 35748d9a627b6..fdcad148667db 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -31,14 +31,12 @@ Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
   const char *GlobalName = DeviceGlobal.getName().data();
 
   L0ProgramTy &Program = L0ProgramTy::makeL0Program(Image);
-  void *Addr = Program.getOffloadVarDeviceAddr(GlobalName);
+  auto AddrOrErr = Program.getOffloadVarDeviceAddr(GlobalName);
+  if (!AddrOrErr)
+    return AddrOrErr.takeError();
 
-  // Save the pointer to the symbol allowing nullptr.
+  // Save the device address of the symbol.
-  DeviceGlobal.setPtr(Addr);
-
-  if (Addr == nullptr)
-    return Plugin::error(ErrorCode::UNKNOWN, "Failed to load global '%s'",
-                         GlobalName);
+  DeviceGlobal.setPtr(*AddrOrErr);
 
   return Plugin::success();
 }
@@ -481,11 +479,12 @@ Error L0ProgramTy::buildModules(const std::string_view BuildOptions) {
   return Plugin::error(ErrorCode::UNKNOWN, "Failed to create program modules.");
 }
 
-void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
+Expected<void *> L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
   DP("Looking up OpenMP global variable '%s'.\n", CName);
 
   if (!GlobalModule || !CName)
-    return nullptr;
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid arguments to getOffloadVarDeviceAddr");
 
   std::string Name(CName);
   size_t SizeDummy = 0;
@@ -497,9 +496,9 @@ void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
     if (RC == ZE_RESULT_SUCCESS && DevicePtr)
       return DevicePtr;
   }
-  DP("Warning: global variable '%s' was not found in the device.\n",
-     Name.c_str());
-  return nullptr;
+  return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                       "Global variable '%s' not found on device",
+                       Name.c_str());
 }
 
 Error L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,

>From 45d38a21cbe6c2f89de027ace86e29f05583c5fa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 20:31:12 +0200
Subject: [PATCH 53/70] Remove legacy error checking Pt 3

---
 .../level_zero/include/L0Context.h            | 14 +++-----
 .../level_zero/include/L0Defs.h               | 10 ------
 .../level_zero/include/L0Device.h             | 15 +++-----
 .../level_zero/include/L0Memory.h             | 14 ++++----
 .../level_zero/include/L0Plugin.h             |  2 +-
 .../level_zero/src/L0Context.cpp              | 23 +++++++++----
 .../level_zero/src/L0Device.cpp               | 34 +++++++++++--------
 .../level_zero/src/L0Kernel.cpp               | 11 ++++--
 .../level_zero/src/L0Memory.cpp               | 11 +++---
 .../level_zero/src/L0Plugin.cpp               | 12 ++++---
 10 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 340df41e33cef..7248ddd78d317 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -71,7 +71,8 @@ class L0ContextTy {
 
-  /// Create context, initialize event pool and extension functions
+  /// Store the plugin and driver handle; context creation happens in init()
   L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
-              int32_t DriverId);
+              int32_t DriverId)
+      : Plugin(Plugin), zeDriver(zeDriver) {}
 
   L0ContextTy(const L0ContextTy &) = delete;
   L0ContextTy(L0ContextTy &&) = delete;
@@ -81,15 +82,8 @@ class L0ContextTy {
   /// Release resources
   ~L0ContextTy() {}
 
-  Error deinit() {
-    EventPool.deinit();
-    auto Err = HostMemAllocator.deinit();
-    if (Err)
-      return Err;
-    if (zeContext)
-      CALL_ZE_RET_ERROR(zeContextDestroy, zeContext);
-    return Error::success();
-  }
+  Error init();
+  Error deinit();
 
   auto &getPlugin() const { return Plugin; }
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index ed5c08730056f..1036cc9fe0cca 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -42,16 +42,6 @@ using namespace error;
 /// Generic L0 handle type
 using ZeHandleTy = void *;
 
-template <typename... ArgsTy>
-static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
-
-  if (Code == OFFLOAD_SUCCESS)
-    return Plugin::success();
-  const char *Desc = "Unknown error";
-  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
-                                                    ErrFmt, Args..., Desc);
-}
-
 } // namespace llvm::omp::target::plugin
 
 #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 3519c1fb76d52..2e2ea473f8da8 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -267,8 +267,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
   std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
 
-  Error internalInit();
-
 public:
   L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
              ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
@@ -284,11 +282,6 @@ class L0DeviceTy final : public GenericDeviceTy {
     MemoryProperties.pNext = nullptr;
     CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
     CacheProperties.pNext = nullptr;
-
-    auto Err = internalInit();
-    if (Err)
-      FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
-                    toString(std::move(Err)).c_str());
   }
 
   static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
@@ -507,11 +500,13 @@ class L0DeviceTy final : public GenericDeviceTy {
   }
 
   /// Return an event from the driver associated to this device
-  ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); }
+  Expected<ze_event_handle_t> getEvent() {
+    return l0Context.getEventPool().getEvent();
+  }
 
   /// Release event to the pool associated to this device
-  void releaseEvent(ze_event_handle_t Event) {
-    l0Context.getEventPool().releaseEvent(Event, *this);
+  Error releaseEvent(ze_event_handle_t Event) {
+    return l0Context.getEventPool().releaseEvent(Event, *this);
   }
 
   StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index a457247c36256..c8371ce4daf47 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -464,25 +464,27 @@ class EventPoolTy {
 
 public:
   /// Initialize context, flags, and mutex
-  void init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
+  Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
     Context = ContextIn;
     Flags = FlagsIn;
     Mtx.reset(new std::mutex);
+    return Plugin::success();
   }
 
   /// Destroys L0 resources
-  void deinit() {
+  Error deinit() {
     for (auto E : Events)
-      CALL_ZE_RET_VOID(zeEventDestroy, E);
+      CALL_ZE_RET_ERROR(zeEventDestroy, E);
     for (auto P : Pools)
-      CALL_ZE_RET_VOID(zeEventPoolDestroy, P);
+      CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
+    return Plugin::success();
   }
 
   /// Get a free event from the pool
-  ze_event_handle_t getEvent();
+  Expected<ze_event_handle_t> getEvent();
 
   /// Return an event to the pool
-  void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
+  Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
 };
 
 /// Staging buffer
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 971e665b2954f..f718486035f13 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -81,7 +81,7 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 
   /// Find L0 devices and initialize device properties.
   /// Returns number of devices reported to omptarget.
-  int32_t findDevices();
+  Expected<int32_t> findDevices();
 
   L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
     assert("Invalid device ID" && DeviceId >= 0 &&
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
index 3f50ffd2a7260..881c84a5a0972 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Context.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -15,17 +15,26 @@
 
 namespace llvm::omp::target::plugin {
 
-L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
-                         int32_t /*DriverId*/)
-    : Plugin(Plugin), zeDriver(zeDriver) {
-  CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+Error L0ContextTy::init() {
+  CALL_ZE_RET_ERROR(zeDriverGetApiVersion, zeDriver, &APIVersion);
   DP("Driver API version is %" PRIx32 "\n", APIVersion);
 
   ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
-  CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
-
-  EventPool.init(zeContext, 0);
+  CALL_ZE_RET_ERROR(zeContextCreate, zeDriver, &Desc, &zeContext);
+  if (auto Err = EventPool.init(zeContext, 0))
+    return Err;
   HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+  return Plugin::success();
+}
+
+Error L0ContextTy::deinit() {
+  if (auto Err = EventPool.deinit())
+    return Err;
+  if (auto Err = HostMemAllocator.deinit())
+    return Err;
+  if (zeContext)
+    CALL_ZE_RET_ERROR(zeContextDestroy, zeContext);
+  return Plugin::success();
 }
 
 StagingBufferTy &L0ContextTy::getStagingBuffer() {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 28c0eaa12f92d..3e648a2e11647 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -174,7 +174,7 @@ void L0DeviceTy::reportDeviceInfo() const {
   DP("-- Max clock frequency (MHz)    : %" PRIu32 "\n", getClockRate());
 }
 
-Error L0DeviceTy::internalInit() {
+Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
   const auto &Options = getPlugin().getOptions();
 
   uint32_t Count = 1;
@@ -219,10 +219,6 @@ Error L0DeviceTy::internalInit() {
   return Plugin::success();
 }
 
-Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
-  return Plugin::success();
-}
-
 Error L0DeviceTy::deinitImpl() {
   for (auto &PGM : Programs)
     if (auto Err = PGM.deinit())
@@ -293,7 +289,8 @@ Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
         CALL_ZE_RET_ERROR(zeEventHostSynchronize, KE, UINT64_MAX);
       }
       for (auto &Event : WaitEvents) {
-        releaseEvent(Event);
+        if (auto Err = releaseEvent(Event))
+          return Err;
       }
     } else { // Async
       // Wait for all events. We should wait and reset events in reverse order
@@ -308,7 +305,8 @@ Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
           if (*Itr == AsyncQueue->KernelEvent)
             WaitDone = true;
         }
-        releaseEvent(*Itr);
+        if (auto Err = releaseEvent(*Itr))
+          return Err;
       }
     }
   }
@@ -507,13 +505,13 @@ Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
   const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice();
 
   if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) {
-    if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size,
-                            (__tgt_async_info *)AsyncInfoWrapper))
-      return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+    if (auto Err = enqueueMemCopyAsync(DstPtr, SrcPtr, Size,
+                                       (__tgt_async_info *)AsyncInfoWrapper))
+      return Err;
   } else {
-    if (enqueueMemCopy(DstPtr, SrcPtr, Size,
-                       /* AsyncInfo */ nullptr, UseCopyEngine))
-      return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+    if (auto Err = enqueueMemCopy(DstPtr, SrcPtr, Size,
+                                  /* AsyncInfo */ nullptr, UseCopyEngine))
+      return Err;
   }
   return Plugin::success();
 }
@@ -708,7 +706,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
                                       bool CopyTo) {
   const bool Ordered =
       (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
-  ze_event_handle_t SignalEvent = getEvent();
+  auto EventOrErr = getEvent();
+  if (!EventOrErr)
+    return EventOrErr.takeError();
+  ze_event_handle_t SignalEvent = *EventOrErr;
   size_t NumWaitEvents = 0;
   ze_event_handle_t *WaitEvents = nullptr;
   AsyncQueueTy *AsyncQueue = reinterpret_cast<AsyncQueueTy *>(AsyncInfo->Queue);
@@ -734,7 +735,10 @@ Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
                                  size_t PatternSize, size_t Size) {
   if (useImmForCopy()) {
     const auto CmdList = getImmCopyCmdList();
-    auto Event = getEvent();
+    auto EventOrErr = getEvent();
+    if (!EventOrErr)
+      return EventOrErr.takeError();
+    ze_event_handle_t Event = *EventOrErr;
     CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
                       PatternSize, Size, Event, 0, nullptr);
     CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index f77a9d3f4b0e4..61f659d995ec0 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -557,7 +557,10 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   if (UseImmCmdList) {
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Using immediate command list for kernel submission.\n");
-    auto Event = Device.getEvent();
+    auto EventOrError = Device.getEvent();
+    if (!EventOrError)
+      return EventOrError.takeError();
+    ze_event_handle_t Event = *EventOrError;
     size_t NumWaitEvents = 0;
     ze_event_handle_t *WaitEvents = nullptr;
     if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
@@ -584,7 +587,8 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
       AsyncQueue->KernelEvent = Event;
     } else {
       CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
-      Device.releaseEvent(Event);
+      if (auto Err = Device.releaseEvent(Event))
+        return Err;
     }
   } else {
     ze_event_handle_t Event = nullptr;
@@ -603,7 +607,8 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
     CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
     CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
     if (Event) {
-      Device.releaseEvent(Event);
+      if (auto Err = Device.releaseEvent(Event))
+        return Err;
     }
   }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 7727df4725a41..1b5c8a48924f3 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -647,7 +647,7 @@ Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
   return Mem;
 }
 
-ze_event_handle_t EventPoolTy::getEvent() {
+Expected<ze_event_handle_t> EventPoolTy::getEvent() {
   std::lock_guard<std::mutex> Lock(*Mtx);
 
   if (Events.empty()) {
@@ -656,7 +656,7 @@ ze_event_handle_t EventPoolTy::getEvent() {
     Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
     Desc.count = PoolSize;
     ze_event_pool_handle_t Pool;
-    CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool);
+    CALL_ZE_RET_ERROR(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool);
     Pools.push_back(Pool);
 
     // Create events
@@ -666,7 +666,7 @@ ze_event_handle_t EventPoolTy::getEvent() {
     for (uint32_t I = 0; I < PoolSize; I++) {
       EventDesc.index = I;
       ze_event_handle_t Event;
-      CALL_ZE_RET_NULL(zeEventCreate, Pool, &EventDesc, &Event);
+      CALL_ZE_RET_ERROR(zeEventCreate, Pool, &EventDesc, &Event);
       Events.push_back(Event);
     }
   }
@@ -678,10 +678,11 @@ ze_event_handle_t EventPoolTy::getEvent() {
 }
 
 /// Return an event to the pool
-void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
+Error EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
   std::lock_guard<std::mutex> Lock(*Mtx);
-  CALL_ZE_RET_VOID(zeEventHostReset, Event);
+  CALL_ZE_RET_ERROR(zeEventHostReset, Event);
   Events.push_back(Event);
+  return Plugin::success();
 }
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 3df7b1e168712..046ac9bdadd86 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -27,10 +27,10 @@ using namespace error;
 // Common data across all possible plugin instantiations
 L0OptionsTy LevelZeroPluginTy::Options;
 
-int32_t LevelZeroPluginTy::findDevices() {
-  CALL_ZE_RET_ZERO(zeInit, ZE_INIT_FLAG_GPU_ONLY);
+Expected<int32_t> LevelZeroPluginTy::findDevices() {
+  CALL_ZE_RET_ERROR(zeInit, ZE_INIT_FLAG_GPU_ONLY);
   uint32_t NumDrivers = 0;
-  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, nullptr);
+  CALL_ZE_RET_ERROR(zeDriverGet, &NumDrivers, nullptr);
   if (NumDrivers == 0) {
     DP("Cannot find any drivers.\n");
     return 0;
@@ -39,7 +39,7 @@ int32_t LevelZeroPluginTy::findDevices() {
   // We expect multiple drivers on Windows to support different device types,
   // so we need to maintain multiple drivers and contexts in general.
   llvm::SmallVector<ze_driver_handle_t> FoundDrivers(NumDrivers);
-  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, FoundDrivers.data());
+  CALL_ZE_RET_ERROR(zeDriverGet, &NumDrivers, FoundDrivers.data());
 
   struct RootInfoTy {
     uint32_t OrderId;
@@ -62,8 +62,10 @@ int32_t LevelZeroPluginTy::findDevices() {
     // We have a driver that supports at least one device
     ContextList.emplace_back(*this, Driver, DriverId);
     auto &DrvInfo = ContextList.back();
+    if (auto Err = DrvInfo.init())
+      return std::move(Err);
     llvm::SmallVector<ze_device_handle_t> FoundDevices(DeviceCount);
-    CALL_ZE_RET_ZERO(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
+    CALL_ZE_RET_ERROR(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
 
     for (auto &zeDevice : FoundDevices)
       RootDevices.push_back(

>From 6ec68e75e9f2b98988b3cd82817441e74dd8fe27 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 21 Oct 2025 23:16:21 +0200
Subject: [PATCH 54/70] Remove legacy error checking Pt 4

---
 .../level_zero/include/L0Memory.h             |  24 ++---
 .../level_zero/src/L0Context.cpp              |   3 +-
 .../level_zero/src/L0Device.cpp               |   3 +-
 .../level_zero/src/L0Memory.cpp               | 102 ++++++++++--------
 4 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index c8371ce4daf47..8a6955699139f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -205,15 +205,6 @@ class MemAllocatorTy {
 
   public:
     MemPoolTy() = default;
-
-    /// Construct pool with allocation kind, allocator, and user options.
-    MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
-              const L0OptionsTy &Option);
-    // Used for reduction pool
-    MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
-    // Used for small memory pool with fixed parameters
-    MemPoolTy(MemAllocatorTy *_Allocator);
-
     MemPoolTy(const MemPoolTy &) = delete;
     MemPoolTy(MemPoolTy &&) = delete;
     MemPoolTy &operator=(const MemPoolTy &) = delete;
@@ -221,12 +212,21 @@ class MemAllocatorTy {
     ~MemPoolTy() {}
 
     void printUsage();
+
+    /// Initialize pool with allocation kind, allocator, and user options.
+    Error init(int32_t Kind, MemAllocatorTy *_Allocator,
+               const L0OptionsTy &Option);
+    // Initialize pool used for reduction pool
+    Error init(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+    // Initialize pool used for small memory pool with fixed parameters
+    Error init(MemAllocatorTy *_Allocator);
+
     /// Release resources used in the pool.
     Error deinit();
 
     /// Allocate the requested size of memory from this pool.
     /// AllocSize is the chunk size internally used for the returned memory.
-    void *alloc(size_t Size, size_t &AllocSize);
+    Expected<void *> alloc(size_t Size, size_t &AllocSize);
     /// Deallocate the specified memory and returns block size deallocated.
     size_t dealloc(void *Ptr);
   }; // MemPoolTy
@@ -322,8 +322,8 @@ class MemAllocatorTy {
   /// Allocator only supports host memory
   bool supportsHostMem() { return IsHostMem; }
 
-  void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
-  void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+  Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+  Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
   void updateMaxAllocSize(L0DeviceTy &L0Device);
 
   /// Allocate memory from L0 GPU RT. We use over-allocation workaround
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
index 881c84a5a0972..6583955b082b3 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Context.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -23,7 +23,8 @@ Error L0ContextTy::init() {
   CALL_ZE_RET_ERROR(zeContextCreate, zeDriver, &Desc, &zeContext);
   if (auto Err = EventPool.init(zeContext, 0))
     return Err;
-  HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+  if (auto Err = HostMemAllocator.initHostPool(*this, Plugin.getOptions()))
+    return Err;
   return Plugin::success();
 }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 3e648a2e11647..b01d5091a58d2 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -214,7 +214,8 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
   LinkCopyOrdinal = findCopyOrdinal(true);
   IsAsyncEnabled =
       isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
-  MemAllocator.initDevicePools(*this, getPlugin().getOptions());
+  if (auto Err = MemAllocator.initDevicePools(*this, getPlugin().getOptions()))
+    return Err;
   l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
   return Plugin::success();
 }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 1b5c8a48924f3..dd26ed55a86fa 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -63,10 +63,10 @@ void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
   FreeSlot = Slot;
 }
 
-MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
-                                     const L0OptionsTy &Option) {
+Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
+                                      const L0OptionsTy &Option) {
   AllocKind = Kind;
-  Allocator = _Allocator;
+  Allocator = AllocatorIn;
 
   // Read user-defined options
   const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
@@ -85,14 +85,14 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
   // allocation size when allocating from L0.
   auto MemOrErr = Allocator->allocL0(8, 0, AllocKind);
   if (!MemOrErr)
-    FATAL_MESSAGE0(0, "Failed to allocate memory pool\n");
+    return MemOrErr.takeError();
   void *Mem = *MemOrErr;
   ze_memory_allocation_properties_t AP{
       ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
       ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
-  CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+  CALL_ZE_RET_ERROR(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
   AllocUnit = (std::max)(AP.pageSize, AllocUnit);
-  CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+  CALL_ZE_RET_ERROR(zeMemFree, Context, Mem);
 
   bool IsDiscrete = false;
   if (Device) {
@@ -100,7 +100,8 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
     Properties.deviceId = 0;
     Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
     Properties.pNext = nullptr;
-    CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+    CALL_ZE_RET_ERROR(zeDeviceGetProperties, Device->getZeDevice(),
+                      &Properties);
     IsDiscrete = Device->isDiscreteDevice();
 
     if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
@@ -153,13 +154,14 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
      "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
      AllocKindToStr(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
      BlockCapacity, PoolSizeMax);
+  return Plugin::success();
 }
 
 // Used for reduction pool
-MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
-                                     const L0OptionsTy &Option) {
+Error MemAllocatorTy::MemPoolTy::init(MemAllocatorTy *AllocatorIn,
+                                      const L0OptionsTy &Option) {
   AllocKind = TARGET_ALLOC_DEVICE;
-  Allocator = _Allocator;
+  Allocator = AllocatorIn;
   AllocMin = AllocUnit = 1024 << 6; // 64KB
   AllocMax = Option.ReductionPoolInfo[0] << 20;
   BlockCapacity = Option.ReductionPoolInfo[1];
@@ -178,12 +180,13 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
   DP("Initialized reduction scratch pool for device " DPxMOD
      ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
      DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+  return Plugin::success();
 }
 
 // Used for small memory pool with fixed parameters
-MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
+Error MemAllocatorTy::MemPoolTy::init(MemAllocatorTy *AllocatorIn) {
   AllocKind = TARGET_ALLOC_DEVICE;
-  Allocator = _Allocator;
+  Allocator = AllocatorIn;
   AllocMax = AllocMin;
   BlockCapacity = AllocUnit / AllocMax;
   PoolSize = 0;
@@ -195,6 +198,7 @@ MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
   DP("Initialized zero-initialized reduction counter pool for "
      "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
      DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+  return Plugin::success();
 }
 
 void MemAllocatorTy::MemPoolTy::printUsage() {
@@ -254,7 +258,8 @@ Error MemAllocatorTy::MemPoolTy::deinit() {
 
 /// Allocate the requested size of memory from this pool.
 /// AllocSize is the chunk size internally used for the returned memory.
-void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+Expected<void *> MemAllocatorTy::MemPoolTy::alloc(size_t Size,
+                                                  size_t &AllocSize) {
   if (Size == 0 || Size > AllocMax)
     return nullptr;
 
@@ -281,21 +286,15 @@ void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
     const auto ChunkSize = BucketParams[BucketId].first;
     const auto BlockSize = BucketParams[BucketId].second;
     auto BaseOrErr = Allocator->allocL0(BlockSize, 0, AllocKind);
-    if (!BaseOrErr) {
-      consumeError(BaseOrErr.takeError());
-      DP("Failed to allocate new block for %s pool\n",
-         AllocKindToStr(AllocKind));
-      return nullptr;
-    }
+    if (!BaseOrErr)
+      return BaseOrErr.takeError();
+
     void *Base = *BaseOrErr;
 
     if (ZeroInit) {
       auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
-      if (Err) {
-        consumeError(std::move(Err));
-        DP("Failed to zero-initialize pool memory\n");
-        return nullptr;
-      }
+      if (Err)
+        return std::move(Err);
     }
 
     BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
@@ -367,54 +366,60 @@ bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
   return true;
 }
 
-void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
-                                     const L0OptionsTy &Option) {
+Error MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
+                                      const L0OptionsTy &Options) {
   SupportsLargeMem = L0Device.supportsLargeMem();
   IsHostMem = false;
   Device = &L0Device;
   L0Context = &L0Device.getL0Context();
   for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
-    if (Option.MemPoolInfo.count(Kind) > 0) {
+    if (Options.MemPoolInfo.count(Kind) > 0) {
       std::lock_guard<std::mutex> Lock(Mtx);
-      Pools[Kind] = std::make_unique<MemPoolTy>(Kind, this, Option);
+      Pools[Kind] = std::make_unique<MemPoolTy>();
+      if (auto Err = Pools[Kind]->init(Kind, this, Options))
+        return Err;
     }
   }
-  ReductionPool = std::make_unique<MemPoolTy>(this, Option);
-  CounterPool = std::make_unique<MemPoolTy>(this);
+  ReductionPool = std::make_unique<MemPoolTy>();
+  if (auto Err = ReductionPool->init(this, Options))
+    return Err;
+  CounterPool = std::make_unique<MemPoolTy>();
+  if (auto Err = CounterPool->init(this))
+    return Err;
   updateMaxAllocSize(L0Device);
+  return Plugin::success();
 }
 
-void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
-                                  const L0OptionsTy &Option) {
+Error MemAllocatorTy::initHostPool(L0ContextTy &Driver,
+                                   const L0OptionsTy &Option) {
   SupportsLargeMem = Driver.supportsLargeMem();
   IsHostMem = true;
   this->L0Context = &Driver;
   if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
     std::lock_guard<std::mutex> Lock(Mtx);
-    Pools[TARGET_ALLOC_HOST] =
-        std::make_unique<MemPoolTy>(TARGET_ALLOC_HOST, this, Option);
+    Pools[TARGET_ALLOC_HOST] = std::make_unique<MemPoolTy>();
+    if (auto Err =
+            Pools[TARGET_ALLOC_HOST]->init(TARGET_ALLOC_HOST, this, Option))
+      return Err;
   }
+  return Plugin::success();
 }
 
 void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
   // Update the maximum allocation size for this Allocator
-  ze_device_properties_t P;
-  P.maxMemAllocSize = 0;
-  P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-  P.pNext = nullptr;
-  CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &P);
+  auto maxMemAllocSize = L0Device.getMaxMemAllocSize();
 
   if (IsHostMem) {
     // MaxAllocSize should be the minimum of all devices from the driver
-    if (MaxAllocSize > P.maxMemAllocSize) {
-      MaxAllocSize = P.maxMemAllocSize;
+    if (MaxAllocSize > maxMemAllocSize) {
+      MaxAllocSize = maxMemAllocSize;
       DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
          DPxPTR(L0Context), MaxAllocSize);
     }
     return;
   }
 
-  MaxAllocSize = P.maxMemAllocSize;
+  MaxAllocSize = maxMemAllocSize;
   DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
      MaxAllocSize);
 }
@@ -502,12 +507,19 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
     if (Align > 0)
       AllocSize += (Align - 1);
     size_t PoolAllocSize = 0;
+    MemPoolTy *Pool = nullptr;
+    // Select the pool serving this request before allocating from it.
     if (UseScratchPool)
-      AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize);
+      Pool = ReductionPool.get();
     else if (UseZeroInitPool)
-      AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
+      Pool = CounterPool.get();
     else
-      AllocBase = Pools[Kind]->alloc(AllocSize, PoolAllocSize);
+      Pool = Pools[Kind].get();
+
+    auto PtrOrErr = Pool->alloc(AllocSize, PoolAllocSize);
+    if (!PtrOrErr)
+      return PtrOrErr.takeError();
+    AllocBase = *PtrOrErr;
     if (AllocBase) {
       uintptr_t Base = (uintptr_t)AllocBase;
       if (Align > 0)

>From ef8d7940715712ac872b39b229fa1d81b37c643c Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 00:04:46 +0200
Subject: [PATCH 55/70] Add more error checking

---
 offload/include/PerThreadTable.h              |  19 ++
 .../level_zero/include/L0Context.h            |   7 +-
 .../level_zero/include/L0Device.h             |  32 ++--
 .../level_zero/include/L0Memory.h             |  26 +--
 .../level_zero/src/L0Device.cpp               | 177 +++++++++++++-----
 .../level_zero/src/L0Kernel.cpp               |  15 +-
 .../level_zero/src/L0Plugin.cpp               |   3 +-
 7 files changed, 195 insertions(+), 84 deletions(-)

diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index fa4bc230a1fea..1e404fbcd45e4 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -14,6 +14,7 @@
 #define OFFLOAD_PERTHREADTABLE_H
 
 #include <list>
+#include <llvm/Support/Error.h>
 #include <memory>
 #include <mutex>
 #include <type_traits>
@@ -204,6 +205,24 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
     }
     ThreadDataList.clear();
   }
+
+  template <class F> llvm::Error deinit(F f) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    for (auto ThData : ThreadDataList) {
+      if (!ThData->ThEntry || ThData->NElements == 0)
+        continue;
+      for (auto &Obj : *ThData->ThEntry) {
+        if constexpr (is_associative<ContainerType>::value) {
+          if (auto Err = f(Obj.second))
+            return Err;
+        } else {
+          if (auto Err = f(Obj))
+            return Err;
+        }
+      }
+    }
+    return llvm::Error::success();
+  }
 };
 
 template <typename T, typename = std::void_t<>> struct ContainerValueType {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 7248ddd78d317..8779c703659c4 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -27,14 +27,15 @@ class L0ContextTLSTy {
   auto &getStagingBuffer() { return StagingBuffer; }
   const auto &getStagingBuffer() const { return StagingBuffer; }
 
-  void clear() { StagingBuffer.clear(); }
+  Error deinit() { return StagingBuffer.clear(); }
 };
 
 struct L0ContextTLSTableTy
     : public PerThreadContainer<
           std::unordered_map<ze_context_handle_t, L0ContextTLSTy>> {
-  void clear() {
-    PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); });
+  Error deinit() {
+    return PerThreadTable::deinit(
+        [](L0ContextTLSTy &Entry) -> auto { return Entry.deinit(); });
   }
 };
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 2e2ea473f8da8..2a845a28f987d 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -423,55 +423,55 @@ class L0DeviceTy final : public GenericDeviceTy {
 
   // Command queues related functions
   /// Create a command list with given ordinal and flags
-  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+  Expected<ze_command_list_handle_t> createCmdList(ze_context_handle_t Context,
                                          ze_device_handle_t Device,
                                          uint32_t Ordinal,
                                          ze_command_list_flags_t Flags,
                                          const std::string_view DeviceIdStr);
 
   /// Create a command list with default flags
-  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+  Expected<ze_command_list_handle_t> createCmdList(ze_context_handle_t Context,
                                          ze_device_handle_t Device,
                                          uint32_t Ordinal,
                                          const std::string_view DeviceIdStr);
 
-  ze_command_list_handle_t getCmdList();
+  Expected<ze_command_list_handle_t> getCmdList();
 
   /// Create a command queue with given ordinal and flags
-  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+  Expected<ze_command_queue_handle_t> createCmdQueue(ze_context_handle_t Context,
                                            ze_device_handle_t Device,
                                            uint32_t Ordinal, uint32_t Index,
                                            ze_command_queue_flags_t Flags,
                                            const std::string_view DeviceIdStr);
 
   /// Create a command queue with default flags
-  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+  Expected<ze_command_queue_handle_t> createCmdQueue(ze_context_handle_t Context,
                                            ze_device_handle_t Device,
                                            uint32_t Ordinal, uint32_t Index,
                                            const std::string_view DeviceIdStr,
                                            bool InOrder = false);
 
   /// Create a new command queue for the given OpenMP device ID
-  ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+  Expected<ze_command_queue_handle_t> createCommandQueue(bool InOrder = false);
 
   /// Create an immediate command list
-  ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+  Expected<ze_command_list_handle_t> createImmCmdList(uint32_t Ordinal, uint32_t Index,
                                             bool InOrder = false);
 
   /// Create an immediate command list for computing
-  ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+  Expected<ze_command_list_handle_t> createImmCmdList(bool InOrder = false) {
     return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
   }
 
   /// Create an immediate command list for copying
-  ze_command_list_handle_t createImmCopyCmdList();
-  ze_command_queue_handle_t getCmdQueue();
-  ze_command_list_handle_t getCopyCmdList();
-  ze_command_queue_handle_t getCopyCmdQueue();
-  ze_command_list_handle_t getLinkCopyCmdList();
-  ze_command_queue_handle_t getLinkCopyCmdQueue();
-  ze_command_list_handle_t getImmCmdList();
-  ze_command_list_handle_t getImmCopyCmdList();
+  Expected<ze_command_list_handle_t> createImmCopyCmdList();
+  Expected<ze_command_queue_handle_t> getCmdQueue();
+  Expected<ze_command_list_handle_t> getCopyCmdList();
+  Expected<ze_command_queue_handle_t> getCopyCmdQueue();
+  Expected<ze_command_list_handle_t> getLinkCopyCmdList();
+  Expected<ze_command_queue_handle_t> getLinkCopyCmdQueue();
+  Expected<ze_command_list_handle_t> getImmCmdList();
+  Expected<ze_command_list_handle_t> getImmCopyCmdList();
 
   /// Enqueue copy command
   Error enqueueMemCopy(void *Dst, const void *Src, size_t Size,
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 8a6955699139f..7229bb90cb7bb 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -502,13 +502,13 @@ class StagingBufferTy {
   /// Next buffer location in the buffers
   size_t Offset = 0;
 
-  void *addBuffers() {
+  Expected<void *> addBuffers() {
     ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
                                        nullptr, 0};
     void *Ret = nullptr;
     size_t AllocSize = Size * Count;
-    CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
-                     L0DefaultAlignment, &Ret);
+    CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
+                      L0DefaultAlignment, &Ret);
     Buffers.push_back(Ret);
     return Ret;
   }
@@ -522,12 +522,13 @@ class StagingBufferTy {
 
   ~StagingBufferTy() {}
 
-  void clear() {
+  Error clear() {
     ze_result_t Rc;
     (void)Rc; // GCC build compiler thinks Rc is unused for some reason.
     for (auto Ptr : Buffers)
-      CALL_ZE(Rc, zeMemFree, Context, Ptr);
+      CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
     Context = nullptr;
+    return Plugin::success();
   }
 
   bool initialized() const { return Context != nullptr; }
@@ -541,23 +542,26 @@ class StagingBufferTy {
   void reset() { Offset = 0; }
 
   /// Always return the first buffer
-  void *get() {
+  Expected<void *> get() {
     if (Size == 0 || Count == 0)
       return nullptr;
     return Buffers.empty() ? addBuffers() : Buffers.front();
   }
 
   /// Return the next available buffer
-  void *getNext() {
+  Expected<void *> getNext() {
     void *Ret = nullptr;
     if (Size == 0 || Count == 0)
       return Ret;
 
     size_t AllocSize = Size * Count;
     bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
-    if (NeedToGrow)
-      Ret = addBuffers();
-    else
+    if (NeedToGrow) {
+      auto PtrOrErr = addBuffers();
+      if (!PtrOrErr)
+        return PtrOrErr.takeError();
+      Ret = *PtrOrErr;
+    } else
       Ret = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
 
@@ -569,7 +573,7 @@ class StagingBufferTy {
   }
 
   /// Return either a fixed buffer or next buffer
-  void *get(bool Next) { return Next ? getNext() : get(); }
+  Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
 };
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index b01d5091a58d2..369732682ff68 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -410,7 +410,10 @@ Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
     if (isDiscreteDevice() &&
         static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
         getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
-      SrcPtr = getStagingBuffer().get(IsAsync);
+      auto PtrOrErr = getStagingBuffer().get(IsAsync);
+      if (!PtrOrErr)
+        return PtrOrErr.takeError();
+      SrcPtr = *PtrOrErr;
       std::copy_n(static_cast<const char *>(HstPtr), Size,
                   static_cast<char *>(const_cast<void *>(SrcPtr)));
     }
@@ -469,7 +472,10 @@ Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
         static_cast<size_t>(Size) <=
             getPlugin().getOptions().StagingBufferSize &&
         getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
-      DstPtr = getStagingBuffer().get(IsAsync);
+      auto PtrOrErr = getStagingBuffer().get(IsAsync);
+      if (!PtrOrErr)
+        return PtrOrErr.takeError();
+      DstPtr = *PtrOrErr;
     }
     if (IsAsync) {
       if (auto Err = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
@@ -632,11 +638,20 @@ Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
     bool InOrder = InteropSpec.attrs.inorder;
     Ret->attrs.inorder = InOrder;
     if (useImmForInterop()) {
-      auto CmdList = createImmCmdList(InOrder);
-      Ret->async_info->Queue = CmdList;
-      L0->ImmCmdList = CmdList;
+      auto CmdListOrErr = createImmCmdList(InOrder);
+      if (!CmdListOrErr) {
+        delete Ret;
+        return CmdListOrErr.takeError();
+      }
+      Ret->async_info->Queue = *CmdListOrErr;
+      L0->ImmCmdList = *CmdListOrErr;
     } else {
-      Ret->async_info->Queue = createCommandQueue(InOrder);
+      auto QueueOrErr = createCommandQueue(InOrder);
+      if (!QueueOrErr) {
+        delete Ret;
+        return QueueOrErr.takeError();
+      }
+      Ret->async_info->Queue = *QueueOrErr;
       L0->CommandQueue =
           static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
     }
@@ -676,17 +691,32 @@ Error L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
   ze_command_queue_handle_t CmdQueue = nullptr;
 
   if (useImmForCopy()) {
-    CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+    auto CmdListOrErr = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
     CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
                       nullptr, 0, nullptr);
     CALL_ZE_RET_ERROR(zeCommandListHostSynchronize, CmdList, UINT64_MAX);
   } else {
     if (UseCopyEngine) {
-      CmdList = getCopyCmdList();
-      CmdQueue = getCopyCmdQueue();
+      auto CmdListOrErr = getCopyCmdList();
+      if (!CmdListOrErr)
+        return CmdListOrErr.takeError();
+      CmdList = *CmdListOrErr;
+      auto CmdQueueOrErr = getCopyCmdQueue();
+      if (!CmdQueueOrErr)
+        return CmdQueueOrErr.takeError();
+      CmdQueue = *CmdQueueOrErr;
     } else {
-      CmdList = getCmdList();
-      CmdQueue = getCmdQueue();
+      auto CmdListOrErr = getCmdList();
+      if (!CmdListOrErr)
+        return CmdListOrErr.takeError();
+      CmdList = *CmdListOrErr;
+      auto CmdQueueOrErr = getCmdQueue();
+      if (!CmdQueueOrErr)
+        return CmdQueueOrErr.takeError();
+      CmdQueue = *CmdQueueOrErr;
     }
 
     CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
@@ -724,7 +754,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
     else
       NumWaitEvents = 0;
   }
-  auto CmdList = getImmCopyCmdList();
+  auto CmdListOrErr = getImmCopyCmdList();
+  if (!CmdListOrErr)
+    return CmdListOrErr.takeError();
+  const auto CmdList = *CmdListOrErr;
   CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
                     SignalEvent, NumWaitEvents, WaitEvents);
   AsyncQueue->WaitEvents.push_back(SignalEvent);
@@ -735,7 +768,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
 Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
                                  size_t PatternSize, size_t Size) {
   if (useImmForCopy()) {
-    const auto CmdList = getImmCopyCmdList();
+    auto CmdListOrErr = getImmCopyCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    const auto CmdList = *CmdListOrErr;
     auto EventOrErr = getEvent();
     if (!EventOrErr)
       return EventOrErr.takeError();
@@ -744,8 +780,14 @@ Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
                       PatternSize, Size, Event, 0, nullptr);
     CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
   } else {
-    auto CmdList = getCopyCmdList();
-    const auto CmdQueue = getCopyCmdQueue();
+    auto CmdListOrErr = getCopyCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    auto CmdList = *CmdListOrErr;
+    auto CmdQueueOrErr = getCopyCmdQueue();
+    if (!CmdQueueOrErr)
+      return CmdQueueOrErr.takeError();
+    const auto CmdQueue = *CmdQueueOrErr;
     CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
                       PatternSize, Size, nullptr, 0, nullptr);
     CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
@@ -800,15 +842,15 @@ Error L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
 
 // Command queues related functions
 /// Create a command list with given ordinal and flags
-ze_command_list_handle_t L0DeviceTy::createCmdList(
+Expected<ze_command_list_handle_t> L0DeviceTy::createCmdList(
     ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
     ze_command_list_flags_t Flags, const std::string_view DeviceIdStr) {
   ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
                                         nullptr, // extension
                                         Ordinal, Flags};
   ze_command_list_handle_t cmdList;
-  CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &cmdListDesc,
-                   &cmdList);
+  CALL_ZE_RET_ERROR(zeCommandListCreate, Context, Device, &cmdListDesc,
+                    &cmdList);
   DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
      ") for device %s.\n",
      DPxPTR(cmdList), Ordinal, DeviceIdStr.data());
@@ -816,7 +858,7 @@ ze_command_list_handle_t L0DeviceTy::createCmdList(
 }
 
 /// Create a command list with default flags
-ze_command_list_handle_t
+Expected<ze_command_list_handle_t>
 L0DeviceTy::createCmdList(ze_context_handle_t Context,
                           ze_device_handle_t Device, uint32_t Ordinal,
                           const std::string_view DeviceIdStr) {
@@ -825,19 +867,22 @@ L0DeviceTy::createCmdList(ze_context_handle_t Context,
              : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
 }
 
-ze_command_list_handle_t L0DeviceTy::getCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::getCmdList() {
   auto &TLS = getTLS();
   auto CmdList = TLS.getCmdList();
   if (!CmdList) {
-    CmdList = createCmdList(getZeContext(), getZeDevice(), getComputeEngine(),
-                            getZeId());
+    auto CmdListOrErr = createCmdList(getZeContext(), getZeDevice(),
+                                      getComputeEngine(), getZeId());
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
     TLS.setCmdList(CmdList);
   }
   return CmdList;
 }
 
 /// Create a command queue with given ordinal and flags
-ze_command_queue_handle_t
+Expected<ze_command_queue_handle_t>
 L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
                            ze_device_handle_t Device, uint32_t Ordinal,
                            uint32_t Index, ze_command_queue_flags_t Flags,
@@ -850,8 +895,8 @@ L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
                                           ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
                                           ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
   ze_command_queue_handle_t cmdQueue;
-  CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &cmdQueueDesc,
-                   &cmdQueue);
+  CALL_ZE_RET_ERROR(zeCommandQueueCreate, Context, Device, &cmdQueueDesc,
+                    &cmdQueue);
   DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
      ", Flags: %" PRIu32 ") for device %s.\n",
      DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.data());
@@ -859,7 +904,7 @@ L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
 }
 
 /// Create a command queue with default flags
-ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
+Expected<ze_command_queue_handle_t> L0DeviceTy::createCmdQueue(
     ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
     uint32_t Index, const std::string_view DeviceIdStr, bool InOrder) {
   ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
@@ -869,7 +914,8 @@ ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
 }
 
 /// Create a new command queue for the given OpenMP device ID
-ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
+Expected<ze_command_queue_handle_t>
+L0DeviceTy::createCommandQueue(bool InOrder) {
   auto cmdQueue =
       createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(),
                      getComputeIndex(), getZeId(), InOrder);
@@ -877,7 +923,7 @@ ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
 }
 
 /// Create an immediate command list
-ze_command_list_handle_t
+Expected<ze_command_list_handle_t>
 L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
   ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
   ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
@@ -888,8 +934,8 @@ L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
                                ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
                                ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
   ze_command_list_handle_t CmdList = nullptr;
-  CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
-                   &Desc, &CmdList);
+  CALL_ZE_RET_ERROR(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
+                    &Desc, &CmdList);
   DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32
      ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n",
      DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr());
@@ -897,7 +943,7 @@ L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
 }
 
 /// Create an immediate command list for copying
-ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::createImmCopyCmdList() {
   uint32_t Ordinal = getMainCopyEngine();
   if (Ordinal == UINT32_MAX)
     Ordinal = getLinkCopyEngine();
@@ -906,24 +952,30 @@ ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
   return createImmCmdList(Ordinal, /*Index*/ 0);
 }
 
-ze_command_queue_handle_t L0DeviceTy::getCmdQueue() {
+Expected<ze_command_queue_handle_t> L0DeviceTy::getCmdQueue() {
   auto &TLS = getTLS();
   auto CmdQueue = TLS.getCmdQueue();
   if (!CmdQueue) {
-    CmdQueue = createCommandQueue();
+    auto CmdQueueOrErr = createCommandQueue();
+    if (!CmdQueueOrErr)
+      return CmdQueueOrErr.takeError();
+    CmdQueue = *CmdQueueOrErr;
     TLS.setCmdQueue(CmdQueue);
   }
   return CmdQueue;
 }
 
-ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::getCopyCmdList() {
   // Use main copy engine if available
   if (hasMainCopyEngine()) {
     auto &TLS = getTLS();
     auto CmdList = TLS.getCopyCmdList();
     if (!CmdList) {
-      CmdList = createCmdList(getZeContext(), getZeDevice(),
-                              getMainCopyEngine(), getZeId());
+      auto CmdListOrErr = createCmdList(getZeContext(), getZeDevice(),
+                                        getMainCopyEngine(), getZeId());
+      if (!CmdListOrErr)
+        return CmdListOrErr.takeError();
+      CmdList = *CmdListOrErr;
       TLS.setCopyCmdList(CmdList);
     }
     return CmdList;
@@ -935,14 +987,17 @@ ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
   return getCmdList();
 }
 
-ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
+Expected<ze_command_queue_handle_t> L0DeviceTy::getCopyCmdQueue() {
   // Use main copy engine if available
   if (hasMainCopyEngine()) {
     auto &TLS = getTLS();
     auto CmdQueue = TLS.getCopyCmdQueue();
     if (!CmdQueue) {
-      CmdQueue = createCmdQueue(getZeContext(), getZeDevice(),
-                                getMainCopyEngine(), 0, getZeId());
+      auto CmdQueueOrErr = createCmdQueue(getZeContext(), getZeDevice(),
+                                          getMainCopyEngine(), 0, getZeId());
+      if (!CmdQueueOrErr)
+        return CmdQueueOrErr.takeError();
+      CmdQueue = *CmdQueueOrErr;
       TLS.setCopyCmdQueue(CmdQueue);
     }
     return CmdQueue;
@@ -954,15 +1009,18 @@ ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
   return getCmdQueue();
 }
 
-ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::getLinkCopyCmdList() {
   // Use link copy engine if available
   if (hasLinkCopyEngine()) {
     auto &TLS = getTLS();
     auto CmdList = TLS.getLinkCopyCmdList();
     if (!CmdList) {
-      CmdList =
+      auto CmdListOrErr =
           createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
                         ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
+      if (!CmdListOrErr)
+        return CmdListOrErr.takeError();
+      CmdList = *CmdListOrErr;
       TLS.setLinkCopyCmdList(CmdList);
     }
     return CmdList;
@@ -974,7 +1032,7 @@ ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
   return getCmdList();
 }
 
-ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
+Expected<ze_command_queue_handle_t> L0DeviceTy::getLinkCopyCmdQueue() {
   // Use link copy engine if available
   if (hasLinkCopyEngine()) {
     auto &TLS = getTLS();
@@ -983,9 +1041,12 @@ ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
       // Try to use different copy engines for multiple threads
       uint32_t Index =
           __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
-      CmdQueue =
+      auto CmdQueueOrErr =
           createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(),
                          Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
+      if (!CmdQueueOrErr)
+        return CmdQueueOrErr.takeError();
+      CmdQueue = *CmdQueueOrErr;
       TLS.setLinkCopyCmdQueue(CmdQueue);
     }
     return CmdQueue;
@@ -997,21 +1058,27 @@ ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
   return getCmdQueue();
 }
 
-ze_command_list_handle_t L0DeviceTy::getImmCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::getImmCmdList() {
   auto &TLS = getTLS();
   auto CmdList = TLS.getImmCmdList();
   if (!CmdList) {
-    CmdList = createImmCmdList();
+    auto CmdListOrErr = createImmCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
     TLS.setImmCmdList(CmdList);
   }
   return CmdList;
 }
 
-ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() {
+Expected<ze_command_list_handle_t> L0DeviceTy::getImmCopyCmdList() {
   auto &TLS = getTLS();
   auto CmdList = TLS.getImmCopyCmdList();
   if (!CmdList) {
-    CmdList = createImmCopyCmdList();
+    auto CmdListOrErr = createImmCopyCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
     TLS.setImmCopyCmdList(CmdList);
   }
   return CmdList;
@@ -1029,11 +1096,21 @@ Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
   ze_command_queue_handle_t CmdQueue = nullptr;
 
   if (useImmForCopy()) {
-    CmdList = getImmCopyCmdList();
+    auto CmdListOrErr = getImmCopyCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr; // assign the outer CmdList, do not shadow it
     CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
   } else {
-    CmdList = getCopyCmdList();
-    CmdQueue = getCopyCmdQueue();
+    auto CmdListOrErr = getCopyCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    auto CmdQueueOrErr = getCopyCmdQueue();
+    if (!CmdQueueOrErr)
+      return CmdQueueOrErr.takeError();
+
+    CmdList = *CmdListOrErr;
+    CmdQueue = *CmdQueueOrErr;
     CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
     CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
     CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 61f659d995ec0..c04ca74287cd7 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -547,11 +547,20 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   const bool UseImmCmdList = Device.useImmForCompute();
 
   if (UseImmCmdList) {
-    CmdList = Device.getImmCmdList();
+    auto CmdListOrErr = Device.getImmCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
     // Command queue is not used with immediate command list
   } else {
-    CmdList = Device.getCmdList();
-    CmdQueue = Device.getCmdQueue();
+    auto CmdListOrErr = Device.getCmdList();
+    if (!CmdListOrErr)
+      return CmdListOrErr.takeError();
+    CmdList = *CmdListOrErr;
+    auto CmdQueueOrErr = Device.getCmdQueue();
+    if (!CmdQueueOrErr)
+      return CmdQueueOrErr.takeError();
+    CmdQueue = *CmdQueueOrErr;
   }
 
   if (UseImmCmdList) {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 046ac9bdadd86..1ffdaf10ff13a 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -148,7 +148,8 @@ Expected<int32_t> LevelZeroPluginTy::initImpl() {
 
 Error LevelZeroPluginTy::deinitImpl() {
   DP("Deinit Level0 plugin!\n");
-  ContextTLSTable.clear();
+  if (auto Err = ContextTLSTable.deinit())
+    return Err;
   DeviceTLSTable.clear();
   ThreadTLSTable.clear();
   for (auto &Context : ContextList)

>From 19e888e52996a985bcc6228f9dc7776f69922e61 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 00:17:12 +0200
Subject: [PATCH 56/70] fix format

---
 .../level_zero/include/L0Device.h             | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 2a845a28f987d..b6fbd2d4031e4 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -423,40 +423,37 @@ class L0DeviceTy final : public GenericDeviceTy {
 
   // Command queues related functions
   /// Create a command list with given ordinal and flags
-  Expected<ze_command_list_handle_t> createCmdList(ze_context_handle_t Context,
-                                         ze_device_handle_t Device,
-                                         uint32_t Ordinal,
-                                         ze_command_list_flags_t Flags,
-                                         const std::string_view DeviceIdStr);
+  Expected<ze_command_list_handle_t>
+  createCmdList(ze_context_handle_t Context, ze_device_handle_t Device,
+                uint32_t Ordinal, ze_command_list_flags_t Flags,
+                const std::string_view DeviceIdStr);
 
   /// Create a command list with default flags
-  Expected<ze_command_list_handle_t> createCmdList(ze_context_handle_t Context,
-                                         ze_device_handle_t Device,
-                                         uint32_t Ordinal,
-                                         const std::string_view DeviceIdStr);
+  Expected<ze_command_list_handle_t>
+  createCmdList(ze_context_handle_t Context, ze_device_handle_t Device,
+                uint32_t Ordinal, const std::string_view DeviceIdStr);
 
   Expected<ze_command_list_handle_t> getCmdList();
 
   /// Create a command queue with given ordinal and flags
-  Expected<ze_command_queue_handle_t> createCmdQueue(ze_context_handle_t Context,
-                                           ze_device_handle_t Device,
-                                           uint32_t Ordinal, uint32_t Index,
-                                           ze_command_queue_flags_t Flags,
-                                           const std::string_view DeviceIdStr);
+  Expected<ze_command_queue_handle_t>
+  createCmdQueue(ze_context_handle_t Context, ze_device_handle_t Device,
+                 uint32_t Ordinal, uint32_t Index,
+                 ze_command_queue_flags_t Flags,
+                 const std::string_view DeviceIdStr);
 
   /// Create a command queue with default flags
-  Expected<ze_command_queue_handle_t> createCmdQueue(ze_context_handle_t Context,
-                                           ze_device_handle_t Device,
-                                           uint32_t Ordinal, uint32_t Index,
-                                           const std::string_view DeviceIdStr,
-                                           bool InOrder = false);
+  Expected<ze_command_queue_handle_t>
+  createCmdQueue(ze_context_handle_t Context, ze_device_handle_t Device,
+                 uint32_t Ordinal, uint32_t Index,
+                 const std::string_view DeviceIdStr, bool InOrder = false);
 
   /// Create a new command queue for the given OpenMP device ID
   Expected<ze_command_queue_handle_t> createCommandQueue(bool InOrder = false);
 
   /// Create an immediate command list
-  Expected<ze_command_list_handle_t> createImmCmdList(uint32_t Ordinal, uint32_t Index,
-                                            bool InOrder = false);
+  Expected<ze_command_list_handle_t>
+  createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder = false);
 
   /// Create an immediate command list for computing
   Expected<ze_command_list_handle_t> createImmCmdList(bool InOrder = false) {

>From a591b0e63b6dbc3e3216a97fb20e03d209b80238 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 09:51:20 +0200
Subject: [PATCH 57/70] Move DeviceRange to PluginInterface and remove internal
 Device list

---
 .../common/include/PluginInterface.h          | 19 ++++-
 .../level_zero/include/L0Plugin.h             | 43 +++-------
 .../level_zero/src/L0Plugin.cpp               | 80 ++++++++-----------
 3 files changed, 64 insertions(+), 78 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c530bba3882c..ed37b8c166bc9 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1266,7 +1266,7 @@ struct GenericPluginTy {
   virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
 
   /// Get the reference to the device with a certain device id.
-  GenericDeviceTy &getDevice(int32_t DeviceId) {
+  GenericDeviceTy &getDevice(int32_t DeviceId) const {
     assert(isValidDeviceId(DeviceId) && "Invalid device id");
     assert(Devices[DeviceId] && "Device is uninitialized");
 
@@ -1527,6 +1527,23 @@ struct GenericPluginTy {
   /// object and return immediately.
   int32_t async_barrier(omp_interop_val_t *Interop);
 
+  struct DevicesRangeTy {
+    using iterator = llvm::SmallVector<GenericDeviceTy *>::iterator;
+
+    iterator BeginIt;
+    iterator EndIt;
+
+    DevicesRangeTy(iterator BeginIt, iterator EndIt)
+        : BeginIt(BeginIt), EndIt(EndIt) {}
+
+    auto &begin() { return BeginIt; }
+    auto &end() { return EndIt; }
+  };
+
+  DevicesRangeTy getDevicesRange() {
+    return DevicesRangeTy(Devices.begin(), Devices.end());
+  }
+
 private:
   /// Indicates if the platform runtime has been fully initialized.
   bool Initialized = false;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index f718486035f13..33737dd58ea74 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -26,16 +26,16 @@ namespace llvm::omp::target::plugin {
 /// Class implementing the LevelZero specific functionalities of the plugin.
 class LevelZeroPluginTy final : public GenericPluginTy {
 private:
-  /// Number of devices available including subdevices
-  uint32_t NumDevices = 0;
+  struct DeviceInfoTy {
+    L0DeviceIdTy Id;
+    L0ContextTy *Driver;
+    bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
+  };
+  llvm::SmallVector<DeviceInfoTy> DetectedDevices;
 
   /// Context (and Driver) specific data
   std::list<L0ContextTy> ContextList;
 
-  /// L0 device used by each OpenMP device
-  using DeviceContainerTy = llvm::SmallVector<L0DeviceTy *>;
-  DeviceContainerTy L0Devices;
-
   // Table containing per-thread information using TLS
   L0ThreadTblTy ThreadTLSTable;
   // Table containing per-thread information for each device using TLS
@@ -51,6 +51,10 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 
   auto &getTLS() { return ThreadTLSTable.get(); }
 
+  /// Find L0 devices and initialize device properties.
+  /// Returns number of devices reported to omptarget.
+  Expected<int32_t> findDevices();
+
 public:
   LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
   virtual ~LevelZeroPluginTy() {}
@@ -62,35 +66,10 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 
   static const auto &getOptions() { return Options; }
 
-  struct DevicesRangeTy {
-    using iterator = DeviceContainerTy::iterator;
-
-    iterator BeginIt;
-    iterator EndIt;
-
-    DevicesRangeTy(iterator BeginIt, iterator EndIt)
-        : BeginIt(BeginIt), EndIt(EndIt) {}
-
-    auto &begin() { return BeginIt; }
-    auto &end() { return EndIt; }
-  };
-
-  auto getDevicesRange() {
-    return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
-  }
-
-  /// Find L0 devices and initialize device properties.
-  /// Returns number of devices reported to omptarget.
-  Expected<int32_t> findDevices();
-
   L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
-    assert("Invalid device ID" && DeviceId >= 0 &&
-           DeviceId < static_cast<int32_t>(L0Devices.size()));
-    return *L0Devices[DeviceId];
+    return static_cast<L0DeviceTy &>(getDevice(DeviceId));
   }
 
-  uint32_t getNumRootDevices() const { return NumDevices; }
-
   AsyncQueueTy *getAsyncQueue() {
     auto *Queue = getTLS().getAsyncQueue();
     if (!Queue)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 1ffdaf10ff13a..68fc4b74d77ef 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -86,57 +86,29 @@ Expected<int32_t> LevelZeroPluginTy::findDevices() {
               return A.IsDiscrete;
             });
 
-  struct DeviceInfoTy {
-    L0DeviceIdTy Id;
-    L0ContextTy *Driver;
-    bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
-  };
-
-  llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
-
   for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
     const auto zeDevice = RootDevices[RootId].zeDevice;
     auto *RootDriver = RootDevices[RootId].Driver;
-    DevicesToAdd.push_back(
-        {{zeDevice, static_cast<int32_t>(RootId), -1, -1}, RootDriver});
-  }
-  NumDevices = DevicesToAdd.size();
-  auto DeviceId = 0;
-  for (auto &DeviceInfo : DevicesToAdd) {
-    auto RootId = DeviceInfo.Id.RootId;
-    auto SubId = DeviceInfo.Id.SubId;
-    auto CCSId = DeviceInfo.Id.CCSId;
-    auto zeDevice = DeviceInfo.Id.zeId;
-    auto *Driver = DeviceInfo.Driver;
-
-    std::string IdStr = std::to_string(RootId) +
-                        (SubId < 0 ? "" : "." + std::to_string(SubId)) +
-                        (CCSId < 0 ? "" : "." + std::to_string(CCSId));
-
-    L0Devices.push_back(new L0DeviceTy(*this, DeviceId, getNumRootDevices(),
-                                       zeDevice, *Driver, std::move(IdStr),
-                                       CCSId < 0 ? 0 : CCSId /* ComputeIndex */
-                                       ));
-    DeviceId++;
+    DetectedDevices.push_back(DeviceInfoTy{
+        {zeDevice, static_cast<int32_t>(RootId), -1, -1}, RootDriver});
   }
+  int32_t NumDevices = DetectedDevices.size();
 
-  DP("Found %" PRIu32 " root devices, %" PRIu32 " total devices.\n",
-     getNumRootDevices(), NumDevices);
+  DP("Found %" PRIi32 " devices.\n", NumDevices);
   DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
-  for (auto &l0Device : L0Devices) {
-    DP("-- %s\n", l0Device->getZeIdCStr());
-    (void)l0Device; // silence warning
+  for (auto &DeviceInfo : DetectedDevices) {
+    (void)DeviceInfo; // to avoid unused variable warning in non-debug builds
+    DP("-- Device %" PRIi32 "%s%s (zeDevice=%p) from Driver %p\n",
+       DeviceInfo.Id.RootId,
+       (DeviceInfo.Id.SubId < 0
+            ? ""
+            : ("." + std::to_string(DeviceInfo.Id.SubId)).c_str()),
+       (DeviceInfo.Id.CCSId < 0
+            ? ""
+            : ("." + std::to_string(DeviceInfo.Id.CCSId)).c_str()),
+       DPxPTR(DeviceInfo.Id.zeId), DPxPTR(DeviceInfo.Driver));
   }
-
-  if (getDebugLevel() > 0) {
-    DP("Root Device Information\n");
-    for (uint32_t I = 0; I < getNumRootDevices(); I++) {
-      auto &l0Device = getDeviceFromId(I);
-      l0Device.reportDeviceInfo();
-    }
-  }
-
-  return getNumRootDevices();
+  return NumDevices;
 }
 
 Expected<int32_t> LevelZeroPluginTy::initImpl() {
@@ -163,7 +135,25 @@ Error LevelZeroPluginTy::deinitImpl() {
 GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
                                                  int32_t DeviceId,
                                                  int32_t NumDevices) {
-  return &getDeviceFromId(DeviceId);
+  auto &DeviceInfo = DetectedDevices[DeviceId];
+  auto RootId = DeviceInfo.Id.RootId;
+  auto SubId = DeviceInfo.Id.SubId;
+  auto CCSId = DeviceInfo.Id.CCSId;
+  auto zeDevice = DeviceInfo.Id.zeId;
+  auto *zeDriver = DeviceInfo.Driver;
+
+  std::string IdStr = std::to_string(RootId) +
+                      (SubId < 0 ? "" : "." + std::to_string(SubId)) +
+                      (CCSId < 0 ? "" : "." + std::to_string(CCSId));
+
+  auto *NewDevice = new L0DeviceTy(
+      static_cast<LevelZeroPluginTy &>(Plugin), DeviceId, NumDevices, zeDevice,
+      *zeDriver, std::move(IdStr), CCSId < 0 ? 0 : CCSId /* ComputeIndex */);
+  if (NewDevice && getDebugLevel() > 0) {
+    DP("Device %" PRIi32 " information\n", DeviceId);
+    NewDevice->reportDeviceInfo();
+  }
+  return NewDevice;
 }
 
 GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {

>From fc61165a14785d850d020c3fe3a6463408a79db5 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 10:25:50 +0200
Subject: [PATCH 58/70] refactor MemPool options field

---
 .../level_zero/include/L0Options.h            | 15 +++++---
 .../level_zero/src/L0Memory.cpp               | 12 +++----
 .../level_zero/src/L0Options.cpp              | 36 +++++++++++--------
 3 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 459eef312f076..176a70554a6d6 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -77,13 +77,18 @@ struct L0OptionsTy {
   /// Staging buffer count
   size_t StagingBufferCount = L0StagingBufferCount;
 
-  // TODO: This should probably be an array indexed by AllocKind
   /// Memory pool parameters
   /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
-  std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
-      {TARGET_ALLOC_DEVICE, {1, 4, 256}},
-      {TARGET_ALLOC_HOST, {1, 4, 256}},
-      {TARGET_ALLOC_SHARED, {8, 4, 256}}};
+  struct MemPoolConfigTy {
+    bool Use;
+    int32_t AllocMax;
+    int32_t Capacity;
+    int32_t PoolSize;
+  };
+  std::array<MemPoolConfigTy, 3> MemPoolConfig{
+      MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_DEVICE
+      MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_HOST
+      MemPoolConfigTy{true, 8, 4, 256}}; // TARGET_ALLOC_SHARED
 
   /// Parameters for memory pools dedicated to reduction scratch space
   std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index dd26ed55a86fa..b1a01c5214f38 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -69,10 +69,10 @@ Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
   Allocator = AllocatorIn;
 
   // Read user-defined options
-  const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
-  const size_t UserAllocMax = UserOptions[0];
-  const size_t UserCapacity = UserOptions[1];
-  const size_t UserPoolSize = UserOptions[2];
+  const auto &UserOptions = Option.MemPoolConfig[AllocKind];
+  const size_t UserAllocMax = UserOptions.AllocMax;
+  const size_t UserCapacity = UserOptions.Capacity;
+  const size_t UserPoolSize = UserOptions.PoolSize;
 
   BlockCapacity = UserCapacity;
   PoolSizeMax = UserPoolSize << 20; // MB to B
@@ -373,7 +373,7 @@ Error MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
   Device = &L0Device;
   L0Context = &L0Device.getL0Context();
   for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
-    if (Options.MemPoolInfo.count(Kind) > 0) {
+    if (Options.MemPoolConfig[Kind].Use) {
       std::lock_guard<std::mutex> Lock(Mtx);
       Pools[Kind] = std::make_unique<MemPoolTy>();
       if (auto Err = Pools[Kind]->init(Kind, this, Options))
@@ -395,7 +395,7 @@ Error MemAllocatorTy::initHostPool(L0ContextTy &Driver,
   SupportsLargeMem = Driver.supportsLargeMem();
   IsHostMem = true;
   this->L0Context = &Driver;
-  if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
+  if (Option.MemPoolConfig[TARGET_ALLOC_HOST].Use) {
     std::lock_guard<std::mutex> Lock(Mtx);
     Pools[TARGET_ALLOC_HOST] = std::make_unique<MemPoolTy>();
     if (auto Err =
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 2e2c2cd5a5bbf..a5029e806c94b 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -41,7 +41,8 @@ void L0OptionsTy::processEnvironmentVars() {
   if (MemoryPoolVar.isPresent()) {
     if (MemoryPoolVar.get() == "0") {
       Flags.UseMemoryPool = 0;
-      MemPoolInfo.clear();
+      std::for_each(MemPoolConfig.begin(), MemPoolConfig.end(),
+                    [](auto &I) { I = {false, 0, 0, 0}; });
     } else {
       std::istringstream Str(MemoryPoolVar.get());
       int32_t MemType = -1;
@@ -50,19 +51,19 @@ void L0OptionsTy::processEnvironmentVars() {
       const std::array<int32_t, 3> DefaultValue{1, 4, 256};
       const int32_t AllMemType = INT32_MAX;
       std::array<int32_t, 3> AllInfo{1, 4, 256};
-      std::map<int32_t, std::array<int32_t, 3>> PoolInfo;
+      std::array<std::array<int32_t, 3>, 3> PoolInfo;
       for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
         if (Token == "device") {
           MemType = TARGET_ALLOC_DEVICE;
-          PoolInfo.emplace(MemType, DefaultValue);
+          PoolInfo[TARGET_ALLOC_DEVICE] = DefaultValue;
           Offset = 0;
         } else if (Token == "host") {
           MemType = TARGET_ALLOC_HOST;
-          PoolInfo.emplace(MemType, DefaultValue);
+          PoolInfo[TARGET_ALLOC_HOST] = DefaultValue;
           Offset = 0;
         } else if (Token == "shared") {
           MemType = TARGET_ALLOC_SHARED;
-          PoolInfo.emplace(MemType, DefaultValue);
+          PoolInfo[TARGET_ALLOC_SHARED] = DefaultValue;
           Offset = 0;
         } else if (Token == "all") {
           MemType = AllMemType;
@@ -87,19 +88,24 @@ void L0OptionsTy::processEnvironmentVars() {
         if (Valid == 2) {
           // "all" is specified -- ignore other inputs
           if (AllInfo[0] > 0) {
-            MemPoolInfo[TARGET_ALLOC_DEVICE] = AllInfo;
-            MemPoolInfo[TARGET_ALLOC_HOST] = AllInfo;
-            MemPoolInfo[TARGET_ALLOC_SHARED] = std::move(AllInfo);
+            MemPoolConfig[TARGET_ALLOC_DEVICE] = {true, AllInfo[0], AllInfo[1],
+                                                  AllInfo[2]};
+            MemPoolConfig[TARGET_ALLOC_HOST] = {true, AllInfo[0], AllInfo[1],
+                                                AllInfo[2]};
+            MemPoolConfig[TARGET_ALLOC_SHARED] = {true, AllInfo[0], AllInfo[1],
+                                                  AllInfo[2]};
           } else {
-            MemPoolInfo.clear();
+            std::for_each(MemPoolConfig.begin(), MemPoolConfig.end(),
+                          [](auto &I) { I = {false, 0, 0, 0}; });
           }
         } else {
-          // Use user-specified configuration
-          for (auto &I : PoolInfo) {
-            if (I.second[0] > 0)
-              MemPoolInfo[I.first] = I.second;
-            else
-              MemPoolInfo.erase(I.first);
+          for (size_t Pool = 0; Pool < PoolInfo.size(); ++Pool) {
+            if (PoolInfo[Pool][0] == 0) {
+              MemPoolConfig[Pool] = {false, 0, 0, 0};
+            } else {
+              MemPoolConfig[Pool] = {true, PoolInfo[Pool][0], PoolInfo[Pool][1],
+                                     PoolInfo[Pool][2]};
+            }
           }
         }
       } else {

>From 40f26260888853a147e93827f7cb50ee5bc02f24 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 10:42:51 +0200
Subject: [PATCH 59/70] change auto returns

---
 .../level_zero/include/L0Context.h            | 10 +--
 .../level_zero/include/L0Device.h             | 67 +++++++++++--------
 .../level_zero/include/L0Kernel.h             |  4 +-
 .../level_zero/include/L0Plugin.h             | 10 +--
 4 files changed, 51 insertions(+), 40 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index 8779c703659c4..20935dac63a44 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -24,8 +24,8 @@ class L0ContextTLSTy {
   StagingBufferTy StagingBuffer;
 
 public:
-  auto &getStagingBuffer() { return StagingBuffer; }
-  const auto &getStagingBuffer() const { return StagingBuffer; }
+  StagingBufferTy &getStagingBuffer() { return StagingBuffer; }
+  const StagingBufferTy &getStagingBuffer() const { return StagingBuffer; }
 
   Error deinit() { return StagingBuffer.clear(); }
 };
@@ -86,7 +86,7 @@ class L0ContextTy {
   Error init();
   Error deinit();
 
-  auto &getPlugin() const { return Plugin; }
+  LevelZeroPluginTy &getPlugin() const { return Plugin; }
 
   StagingBufferTy &getStagingBuffer();
 
@@ -124,8 +124,8 @@ class L0ContextTy {
   ze_api_version_t getDriverAPIVersion() const { return APIVersion; }
 
   /// Return the event pool of this driver
-  auto &getEventPool() { return EventPool; }
-  const auto &getEventPool() const { return EventPool; }
+  EventPoolTy &getEventPool() { return EventPool; }
+  const EventPoolTy &getEventPool() const { return EventPool; }
 
   bool supportsLargeMem() const {
     // Large memory support is available since API version 1.1
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index b6fbd2d4031e4..a57aa55ba5608 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -156,40 +156,44 @@ class L0DeviceTLSTy {
   L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
   L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
 
-  auto getCmdList() const { return CmdList; }
+  ze_command_list_handle_t getCmdList() const { return CmdList; }
   void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; }
 
-  auto getCopyCmdList() const { return CopyCmdList; }
+  ze_command_list_handle_t getCopyCmdList() const { return CopyCmdList; }
   void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) {
     CopyCmdList = _CopyCmdList;
   }
 
-  auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+  ze_command_list_handle_t getLinkCopyCmdList() const {
+    return LinkCopyCmdList;
+  }
   void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) {
     LinkCopyCmdList = _LinkCopyCmdList;
   }
 
-  auto getImmCmdList() const { return ImmCmdList; }
+  ze_command_list_handle_t getImmCmdList() const { return ImmCmdList; }
   void setImmCmdList(ze_command_list_handle_t ImmCmdListIn) {
     ImmCmdList = ImmCmdListIn;
   }
 
-  auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+  ze_command_list_handle_t getImmCopyCmdList() const { return ImmCopyCmdList; }
   void setImmCopyCmdList(ze_command_list_handle_t ImmCopyCmdListIn) {
     ImmCopyCmdList = ImmCopyCmdListIn;
   }
 
-  auto getCmdQueue() const { return CmdQueue; }
+  ze_command_queue_handle_t getCmdQueue() const { return CmdQueue; }
   void setCmdQueue(ze_command_queue_handle_t CmdQueueIn) {
     CmdQueue = CmdQueueIn;
   }
 
-  auto getCopyCmdQueue() const { return CopyCmdQueue; }
+  ze_command_queue_handle_t getCopyCmdQueue() const { return CopyCmdQueue; }
   void setCopyCmdQueue(ze_command_queue_handle_t CopyCmdQueueIn) {
     CopyCmdQueue = CopyCmdQueueIn;
   }
 
-  auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+  ze_command_queue_handle_t getLinkCopyCmdQueue() const {
+    return LinkCopyCmdQueue;
+  }
   void setLinkCopyCmdQueue(ze_command_queue_handle_t LinkCopyCmdQueueIn) {
     LinkCopyCmdQueue = LinkCopyCmdQueueIn;
   }
@@ -288,13 +292,13 @@ class L0DeviceTy final : public GenericDeviceTy {
     return static_cast<L0DeviceTy &>(Device);
   }
 
-  auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+  LevelZeroPluginTy &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
   L0DeviceTLSTy &getTLS();
 
   Error setContext() override { return Plugin::success(); }
   Error initImpl(GenericPluginTy &Plugin) override;
   Error deinitImpl() override;
-  auto getZeDevice() const { return zeDevice; }
+  ze_device_handle_t getZeDevice() const { return zeDevice; }
 
   const L0ContextTy &getL0Context() const { return l0Context; }
   L0ContextTy &getL0Context() { return l0Context; }
@@ -307,14 +311,16 @@ class L0DeviceTy final : public GenericDeviceTy {
 
   std::mutex &getMutex() { return Mutex; }
 
-  auto getComputeIndex() const { return ComputeIndex; }
-  auto getIndirectFlags() const { return IndirectAccessFlags; }
+  uint32_t getComputeIndex() const { return ComputeIndex; }
+  ze_kernel_indirect_access_flags_t getIndirectFlags() const {
+    return IndirectAccessFlags;
+  }
 
-  auto getNumGlobalModules() const { return GlobalModules.size(); }
+  size_t getNumGlobalModules() const { return GlobalModules.size(); }
   void addGlobalModule(ze_module_handle_t Module) {
     GlobalModules.push_back(Module);
   }
-  auto getGlobalModulesArray() { return GlobalModules.data(); }
+  ze_module_handle_t *getGlobalModulesArray() { return GlobalModules.data(); }
 
   L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) {
     for (auto &PGM : Programs)
@@ -332,42 +338,45 @@ class L0DeviceTy final : public GenericDeviceTy {
   }
 
   // add a new program to the device. Return a reference to the new program
-  auto &addProgram(int32_t ImageId, std::unique_ptr<MemoryBuffer> &&Image) {
+  L0ProgramTy &addProgram(int32_t ImageId,
+                          std::unique_ptr<MemoryBuffer> &&Image) {
     Programs.emplace_back(ImageId, *this, std::move(Image));
     return Programs.back();
   }
 
-  const auto &getLastProgram() const { return Programs.back(); }
-  auto &getLastProgram() { return Programs.back(); }
+  const L0ProgramTy &getLastProgram() const { return Programs.back(); }
+  L0ProgramTy &getLastProgram() { return Programs.back(); }
   // Device properties getters
-  auto getVendorId() const { return DeviceProperties.vendorId; }
+  uint32_t getVendorId() const { return DeviceProperties.vendorId; }
   bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
 
-  auto getPCIId() const { return DeviceProperties.deviceId; }
-  auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
-  auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
-  auto getNumEUsPerSubslice() const {
+  uint32_t getPCIId() const { return DeviceProperties.deviceId; }
+  uint32_t getNumThreadsPerEU() const {
+    return DeviceProperties.numThreadsPerEU;
+  }
+  uint32_t getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+  uint32_t getNumEUsPerSubslice() const {
     return DeviceProperties.numEUsPerSubslice;
   }
-  auto getNumSubslicesPerSlice() const {
+  uint32_t getNumSubslicesPerSlice() const {
     return DeviceProperties.numSubslicesPerSlice;
   }
-  auto getNumSlices() const { return DeviceProperties.numSlices; }
-  auto getNumSubslices() const {
+  uint32_t getNumSlices() const { return DeviceProperties.numSlices; }
+  uint32_t getNumSubslices() const {
     return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
   }
   uint32_t getNumEUs() const {
     return DeviceProperties.numEUsPerSubslice * getNumSubslices();
   }
-  auto getTotalThreads() const {
+  uint32_t getTotalThreads() const {
     return DeviceProperties.numThreadsPerEU * getNumEUs();
   }
-  auto getNumThreadsPerSubslice() const {
+  uint32_t getNumThreadsPerSubslice() const {
     return getNumEUsPerSubslice() * getNumThreadsPerEU();
   }
-  auto getClockRate() const { return DeviceProperties.coreClockRate; }
+  uint32_t getClockRate() const { return DeviceProperties.coreClockRate; }
 
-  auto getMaxSharedLocalMemory() const {
+  uint32_t getMaxSharedLocalMemory() const {
     return ComputeProperties.maxSharedLocalMemory;
   }
   auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index ceabe88c618ef..e07ae81c0ff7a 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -108,7 +108,7 @@ class L0KernelTy : public GenericKernelTy {
   ze_kernel_handle_t zeKernel;
   // Kernel Properties
   KernelPropertiesTy Properties;
-  auto &getProperties() { return Properties; }
+  KernelPropertiesTy &getProperties() { return Properties; }
 
   Error runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
                             KernelLaunchParamsTy LaunchParams,
@@ -140,7 +140,7 @@ class L0KernelTy : public GenericKernelTy {
   L0KernelTy &operator=(const L0KernelTy &) = delete;
   L0KernelTy &operator=(const L0KernelTy &&) = delete;
 
-  const auto &getProperties() const { return Properties; }
+  const KernelPropertiesTy &getProperties() const { return Properties; }
 
   /// Initialize the L0 kernel.
   Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 33737dd58ea74..d6a3a974c1e9c 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -49,7 +49,7 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   /// Common pool of AsyncQueue
   AsyncQueuePoolTy AsyncQueuePool;
 
-  auto &getTLS() { return ThreadTLSTable.get(); }
+  L0ThreadTLSTy &getTLS() { return ThreadTLSTable.get(); }
 
   /// Find L0 devices and initialize device properties.
   /// Returns number of devices reported to omptarget.
@@ -59,12 +59,14 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
   virtual ~LevelZeroPluginTy() {}
 
-  auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); }
-  auto &getContextTLS(ze_context_handle_t Context) {
+  L0DeviceTLSTy &getDeviceTLS(int32_t DeviceId) {
+    return DeviceTLSTable.get(DeviceId);
+  }
+  L0ContextTLSTy &getContextTLS(ze_context_handle_t Context) {
     return ContextTLSTable.get(Context);
   }
 
-  static const auto &getOptions() { return Options; }
+  static const L0OptionsTy &getOptions() { return Options; }
 
   L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
     return static_cast<L0DeviceTy &>(getDevice(DeviceId));

>From f04fef72abfabdcd7bbbbab0c47a8fcbc48f3b37 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 22 Oct 2025 15:21:16 +0200
Subject: [PATCH 60/70] Remove link copy queues

---
 .../level_zero/include/L0Device.h             | 41 +------------
 .../level_zero/src/L0Device.cpp               | 58 -------------------
 2 files changed, 2 insertions(+), 97 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index a57aa55ba5608..b7e5d0215b357 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -84,18 +84,12 @@ class L0DeviceTLSTy {
   /// Main copy command list for each device
   ze_command_list_handle_t CopyCmdList = nullptr;
 
-  /// Link copy command list for each device
-  ze_command_list_handle_t LinkCopyCmdList = nullptr;
-
   /// Command queue for each device
   ze_command_queue_handle_t CmdQueue = nullptr;
 
   /// Main copy command queue for each device
   ze_command_queue_handle_t CopyCmdQueue = nullptr;
 
-  /// Link copy command queues for each device
-  ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
-
   /// Immediate command list for each device
   ze_command_list_handle_t ImmCmdList = nullptr;
 
@@ -106,9 +100,8 @@ class L0DeviceTLSTy {
   L0DeviceTLSTy() = default;
   ~L0DeviceTLSTy() {
     // assert all fields are nullptr on destruction
-    assert(!CmdList && !CopyCmdList && !LinkCopyCmdList && !CmdQueue &&
-           !CopyCmdQueue && !LinkCopyCmdQueue && !ImmCmdList &&
-           !ImmCopyCmdList &&
+    assert(!CmdList && !CopyCmdList && !CmdQueue && !CopyCmdQueue &&
+           !ImmCmdList && !ImmCopyCmdList &&
            "L0DeviceTLSTy destroyed without clearing resources");
   }
 
@@ -116,10 +109,8 @@ class L0DeviceTLSTy {
   L0DeviceTLSTy(L0DeviceTLSTy &&Other) {
     CmdList = std::exchange(Other.CmdList, nullptr);
     CopyCmdList = std::exchange(Other.CopyCmdList, nullptr);
-    LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr);
     CmdQueue = std::exchange(Other.CmdQueue, nullptr);
     CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr);
-    LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr);
     ImmCmdList = std::exchange(Other.ImmCmdList, nullptr);
     ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr);
   }
@@ -130,8 +121,6 @@ class L0DeviceTLSTy {
       CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
     if (CopyCmdList)
       CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
-    if (LinkCopyCmdList)
-      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
     if (ImmCmdList)
       CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
     if (ImmCopyCmdList)
@@ -140,15 +129,11 @@ class L0DeviceTLSTy {
       CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
     if (CopyCmdQueue)
       CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
-    if (LinkCopyCmdQueue)
-      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
 
     CmdList = nullptr;
     CopyCmdList = nullptr;
-    LinkCopyCmdList = nullptr;
     CmdQueue = nullptr;
     CopyCmdQueue = nullptr;
-    LinkCopyCmdQueue = nullptr;
     ImmCmdList = nullptr;
     ImmCopyCmdList = nullptr;
   }
@@ -164,13 +149,6 @@ class L0DeviceTLSTy {
     CopyCmdList = _CopyCmdList;
   }
 
-  ze_command_list_handle_t getLinkCopyCmdList() const {
-    return LinkCopyCmdList;
-  }
-  void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) {
-    LinkCopyCmdList = _LinkCopyCmdList;
-  }
-
   ze_command_list_handle_t getImmCmdList() const { return ImmCmdList; }
   void setImmCmdList(ze_command_list_handle_t ImmCmdListIn) {
     ImmCmdList = ImmCmdListIn;
@@ -190,13 +168,6 @@ class L0DeviceTLSTy {
   void setCopyCmdQueue(ze_command_queue_handle_t CopyCmdQueueIn) {
     CopyCmdQueue = CopyCmdQueueIn;
   }
-
-  ze_command_queue_handle_t getLinkCopyCmdQueue() const {
-    return LinkCopyCmdQueue;
-  }
-  void setLinkCopyCmdQueue(ze_command_queue_handle_t LinkCopyCmdQueueIn) {
-    LinkCopyCmdQueue = LinkCopyCmdQueueIn;
-  }
 };
 
 struct L0DeviceTLSTableTy
@@ -238,8 +209,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
   /// Command queue group ordinals for copying
   std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
-  /// Command queue group ordinals and number of queues for link copy engines
-  std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
 
   /// Command queue index for each device
   uint32_t ComputeIndex = 0;
@@ -416,10 +385,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
   uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
 
-  uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
-  uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
-  bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
-
   bool deviceRequiresImmCmdList() const {
     return isDeviceIPorNewer(0x05004000);
   }
@@ -474,8 +439,6 @@ class L0DeviceTy final : public GenericDeviceTy {
   Expected<ze_command_queue_handle_t> getCmdQueue();
   Expected<ze_command_list_handle_t> getCopyCmdList();
   Expected<ze_command_queue_handle_t> getCopyCmdQueue();
-  Expected<ze_command_list_handle_t> getLinkCopyCmdList();
-  Expected<ze_command_queue_handle_t> getLinkCopyCmdQueue();
   Expected<ze_command_list_handle_t> getImmCmdList();
   Expected<ze_command_list_handle_t> getImmCopyCmdList();
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 369732682ff68..19f55c9a55aed 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -211,7 +211,6 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
 
   CopyOrdinal = findCopyOrdinal();
 
-  LinkCopyOrdinal = findCopyOrdinal(true);
   IsAsyncEnabled =
       isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
   if (auto Err = MemAllocator.initDevicePools(*this, getPlugin().getOptions()))
@@ -945,8 +944,6 @@ L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
 /// Create an immediate command list for copying
 Expected<ze_command_list_handle_t> L0DeviceTy::createImmCopyCmdList() {
   uint32_t Ordinal = getMainCopyEngine();
-  if (Ordinal == UINT32_MAX)
-    Ordinal = getLinkCopyEngine();
   if (Ordinal == UINT32_MAX)
     Ordinal = getComputeEngine();
   return createImmCmdList(Ordinal, /*Index*/ 0);
@@ -980,9 +977,6 @@ Expected<ze_command_list_handle_t> L0DeviceTy::getCopyCmdList() {
     }
     return CmdList;
   }
-  // Use link copy engine if available
-  if (hasLinkCopyEngine())
-    return getLinkCopyCmdList();
   // Use compute engine otherwise
   return getCmdList();
 }
@@ -1002,58 +996,6 @@ Expected<ze_command_queue_handle_t> L0DeviceTy::getCopyCmdQueue() {
     }
     return CmdQueue;
   }
-  // Use link copy engine if available
-  if (hasLinkCopyEngine())
-    return getLinkCopyCmdQueue();
-  // Use compute engine otherwise
-  return getCmdQueue();
-}
-
-Expected<ze_command_list_handle_t> L0DeviceTy::getLinkCopyCmdList() {
-  // Use link copy engine if available
-  if (hasLinkCopyEngine()) {
-    auto &TLS = getTLS();
-    auto CmdList = TLS.getLinkCopyCmdList();
-    if (!CmdList) {
-      auto CmdListOrErr =
-          createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
-                        ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
-      if (!CmdListOrErr)
-        return CmdListOrErr.takeError();
-      CmdList = *CmdListOrErr;
-      TLS.setLinkCopyCmdList(CmdList);
-    }
-    return CmdList;
-  }
-  // Use main copy engine if available
-  if (hasMainCopyEngine())
-    return getCopyCmdList();
-  // Use compute engine otherwise
-  return getCmdList();
-}
-
-Expected<ze_command_queue_handle_t> L0DeviceTy::getLinkCopyCmdQueue() {
-  // Use link copy engine if available
-  if (hasLinkCopyEngine()) {
-    auto &TLS = getTLS();
-    auto CmdQueue = TLS.getLinkCopyCmdQueue();
-    if (!CmdQueue) {
-      // Try to use different copy engines for multiple threads
-      uint32_t Index =
-          __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
-      auto CmdQueueOrErr =
-          createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(),
-                         Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
-      if (!CmdQueueOrErr)
-        return CmdQueueOrErr.takeError();
-      CmdQueue = *CmdQueueOrErr;
-      TLS.setLinkCopyCmdQueue(CmdQueue);
-    }
-    return CmdQueue;
-  }
-  // Use main copy engine if available
-  if (hasMainCopyEngine())
-    return getCopyCmdQueue();
   // Use compute engine otherwise
   return getCmdQueue();
 }

>From 4a6b196cf80c8ed60fcd774d32bf992676288717 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 23 Oct 2025 22:23:05 +0200
Subject: [PATCH 61/70] debug build fixes

---
 offload/plugins-nextgen/level_zero/src/L0Memory.cpp | 7 ++++---
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index b1a01c5214f38..be501eb1cabd1 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -16,7 +16,7 @@
 
 namespace llvm::omp::target::plugin {
 
-#if LIBOMPTARGET_DEBUG
+#if OMPTARGET_DEBUG
 static const char *AllocKindToStr(int32_t Kind) {
   switch (Kind) {
   case TARGET_ALLOC_DEVICE:
@@ -455,8 +455,9 @@ Error MemAllocatorTy::deinit() {
   }
   // Report memory usage if requested
   if (getDebugLevel() > 0) {
-    for (auto &Stat : Stats) {
-      DP("Memory usage for %s, device " DPxMOD "\n", AllocKindToStr(Stat.first),
+    for (size_t Kind = 0; Kind < MaxMemKind; Kind++) {
+      auto &Stat = Stats[Kind];
+      DP("Memory usage for %s, device " DPxMOD "\n", AllocKindToStr(Kind),
          DPxPTR(Device));
       if (Stat.NumAllocs[0] == 0 && Stat.NumAllocs[1] == 0) {
         DP("-- Not used\n");
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 68fc4b74d77ef..eb1e81e17300c 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -98,7 +98,7 @@ Expected<int32_t> LevelZeroPluginTy::findDevices() {
   DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
   for (auto &DeviceInfo : DetectedDevices) {
     (void)DeviceInfo; // to avoid unused variable warning in non-debug builds
-    DP("-- Device %" PRIu32 "%s%s (zeDevice=%p) from Driver %p\n",
+    DP("-- Device %" PRIu32 "%s%s (zeDevice=" PRIu64 ") from Driver %p\n",
        DeviceInfo.Id.RootId,
        (DeviceInfo.Id.SubId < 0
             ? ""
@@ -106,7 +106,7 @@ Expected<int32_t> LevelZeroPluginTy::findDevices() {
        (DeviceInfo.Id.CCSId < 0
             ? ""
             : ("." + std::to_string(DeviceInfo.Id.CCSId)).c_str()),
-       DPxPTR(DeviceInfo.Id.zeId), DPxPTR(DeviceInfo.Id.Driver));
+       DPxPTR(DeviceInfo.Id.zeId), DPxPTR(DeviceInfo.Driver));
   }
   return NumDevices;
 }

>From 120e3d44df5fd6b0907b883e1df8a27f899a2d82 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 27 Oct 2025 17:24:39 +0100
Subject: [PATCH 62/70] fixes and cleanup

---
 .../plugins-nextgen/level_zero/include/L0Device.h    |  8 ++++----
 .../plugins-nextgen/level_zero/include/L0Memory.h    |  6 +++---
 offload/plugins-nextgen/level_zero/src/L0Device.cpp  |  4 +++-
 offload/plugins-nextgen/level_zero/src/L0Memory.cpp  | 12 +++++++-----
 offload/plugins-nextgen/level_zero/src/L0Options.cpp |  8 +++-----
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp  |  7 +++----
 6 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index b7e5d0215b357..f15b6e21d7a06 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -348,10 +348,10 @@ class L0DeviceTy final : public GenericDeviceTy {
   uint32_t getMaxSharedLocalMemory() const {
     return ComputeProperties.maxSharedLocalMemory;
   }
-  auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
-  auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
-  auto getCacheSize() const { return CacheProperties.cacheSize; }
-  auto getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; }
+  uint32_t getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+  uint64_t getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+  size_t getCacheSize() const { return CacheProperties.cacheSize; }
+  uint64_t getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; }
 
   int32_t getAllocKind() const { return AllocKind; }
   DeviceArchTy getDeviceArch() const { return DeviceArch; }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 7229bb90cb7bb..e7c8a2cd2f634 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -214,12 +214,12 @@ class MemAllocatorTy {
     void printUsage();
 
     /// Initialize pool with allocation kind, allocator, and user options.
-    Error init(int32_t Kind, MemAllocatorTy *_Allocator,
+    Error init(int32_t Kind, MemAllocatorTy *Allocator,
                const L0OptionsTy &Option);
     // Initialize pool used for reduction pool
-    Error init(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+    Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
     // Initialize pool used for small memory pool with fixed parameters
-    Error init(MemAllocatorTy *_Allocator);
+    Error init(MemAllocatorTy *Allocator);
 
     /// Release resources used in the pool.
     Error deinit();
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 19f55c9a55aed..ed3da28ae43cf 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -213,7 +213,7 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
 
   IsAsyncEnabled =
       isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
-  if (auto Err = MemAllocator.initDevicePools(*this, getPlugin().getOptions()))
+  if (auto Err = MemAllocator.initDevicePools(*this, Options))
     return Err;
   l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
   return Plugin::success();
@@ -639,6 +639,7 @@ Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
     if (useImmForInterop()) {
       auto CmdListOrErr = createImmCmdList(InOrder);
       if (!CmdListOrErr) {
+        delete Ret->async_info;
         delete Ret;
         return CmdListOrErr.takeError();
       }
@@ -647,6 +648,7 @@ Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
     } else {
       auto QueueOrErr = createCommandQueue(InOrder);
       if (!QueueOrErr) {
+        delete Ret->async_info;
         delete Ret;
         return QueueOrErr.takeError();
       }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index be501eb1cabd1..656bb34afc269 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -430,6 +430,8 @@ Error MemAllocatorTy::deinit() {
     return Plugin::success();
 
   std::lock_guard<std::mutex> Lock(Mtx);
+  if (!L0Context)
+    return Plugin::success();
   // Release RTL-owned memory
   for (auto *M : MemOwned) {
     auto Err = deallocLocked(M);
@@ -501,7 +503,7 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
       (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
   const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
 
-  if ((Pools[Kind] != nullptr && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+  if ((Pools[Kind] && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
     // Pool is enabled for the allocation kind, and we do not use any memory
     // advice. We should avoid using pool if there is any meaningful memory
     // advice not to affect sibling allocation in the same block.
@@ -511,11 +513,11 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
     MemPoolTy *Pool = nullptr;
 
     if (UseScratchPool)
-      AllocBase = &ReductionPool;
+      Pool = ReductionPool.get();
     else if (UseZeroInitPool)
-      AllocBase = &CounterPool;
+      Pool = CounterPool.get();
     else
-      AllocBase = Pools[Kind].get();
+      Pool = Pools[Kind].get();
 
     auto PtrOrErr = Pool->alloc(AllocSize, PoolAllocSize);
     if (!PtrOrErr)
@@ -567,7 +569,7 @@ Error MemAllocatorTy::deallocLocked(void *Ptr) {
   }
   if (Info.InPool) {
     size_t DeallocSize = 0;
-    if (Pools[Info.Kind] != nullptr)
+    if (Pools[Info.Kind])
       DeallocSize = Pools[Info.Kind]->dealloc(Info.Base);
     if (DeallocSize == 0) {
       // Try reduction scratch pool
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index a5029e806c94b..76cf39ab0cd78 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -41,8 +41,7 @@ void L0OptionsTy::processEnvironmentVars() {
   if (MemoryPoolVar.isPresent()) {
     if (MemoryPoolVar.get() == "0") {
       Flags.UseMemoryPool = 0;
-      std::for_each(MemPoolConfig.begin(), MemPoolConfig.end(),
-                    [](auto &I) { I = {false, 0, 0, 0}; });
+      MemPoolConfig.fill({false, 0, 0, 0});
     } else {
       std::istringstream Str(MemoryPoolVar.get());
       int32_t MemType = -1;
@@ -51,7 +50,7 @@ void L0OptionsTy::processEnvironmentVars() {
       const std::array<int32_t, 3> DefaultValue{1, 4, 256};
       const int32_t AllMemType = INT32_MAX;
       std::array<int32_t, 3> AllInfo{1, 4, 256};
-      std::array<std::array<int32_t, 3>, 3> PoolInfo;
+      std::array<std::array<int32_t, 3>, 3> PoolInfo = {{{0, 0, 0}}};
       for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
         if (Token == "device") {
           MemType = TARGET_ALLOC_DEVICE;
@@ -95,8 +94,7 @@ void L0OptionsTy::processEnvironmentVars() {
             MemPoolConfig[TARGET_ALLOC_SHARED] = {true, AllInfo[0], AllInfo[1],
                                                   AllInfo[2]};
           } else {
-            std::for_each(MemPoolConfig.begin(), MemPoolConfig.end(),
-                          [](auto &I) { I = {false, 0, 0, 0}; });
+            MemPoolConfig.fill({false, 0, 0, 0});
           }
         } else {
           for (size_t Pool = 0; Pool < PoolInfo.size(); ++Pool) {
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index eb1e81e17300c..9ce5c48737828 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -96,17 +96,16 @@ Expected<int32_t> LevelZeroPluginTy::findDevices() {
 
   DP("Found %" PRIu32 " devices.\n", NumDevices);
   DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
-  for (auto &DeviceInfo : DetectedDevices) {
+    for (auto &DeviceInfo : DetectedDevices) {
     (void)DeviceInfo; // to avoid unused variable warning in non-debug builds
-    DP("-- Device %" PRIu32 "%s%s (zeDevice=" PRIu64 ") from Driver %p\n",
+    DP("-- Device %" PRIu32 "%s%s\n",
        DeviceInfo.Id.RootId,
        (DeviceInfo.Id.SubId < 0
             ? ""
             : ("." + std::to_string(DeviceInfo.Id.SubId)).c_str()),
        (DeviceInfo.Id.CCSId < 0
             ? ""
-            : ("." + std::to_string(DeviceInfo.Id.CCSId)).c_str()),
-       DPxPTR(DeviceInfo.Id.zeId), DPxPTR(DeviceInfo.Driver));
+            : ("." + std::to_string(DeviceInfo.Id.CCSId)).c_str()));
   }
   return NumDevices;
 }

>From 40ad342c18fd4c493c962061e39566dfdd111438 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 27 Oct 2025 17:31:48 +0100
Subject: [PATCH 63/70] format :/

---
 offload/plugins-nextgen/level_zero/include/L0Device.h | 8 ++++++--
 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp   | 5 ++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index f15b6e21d7a06..58d6b3ae0ef5a 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -348,10 +348,14 @@ class L0DeviceTy final : public GenericDeviceTy {
   uint32_t getMaxSharedLocalMemory() const {
     return ComputeProperties.maxSharedLocalMemory;
   }
-  uint32_t getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+  uint32_t getMaxGroupSize() const {
+    return ComputeProperties.maxTotalGroupSize;
+  }
   uint64_t getGlobalMemorySize() const { return MemoryProperties.totalSize; }
   size_t getCacheSize() const { return CacheProperties.cacheSize; }
-  uint64_t getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; }
+  uint64_t getMaxMemAllocSize() const {
+    return DeviceProperties.maxMemAllocSize;
+  }
 
   int32_t getAllocKind() const { return AllocKind; }
   DeviceArchTy getDeviceArch() const { return DeviceArch; }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 9ce5c48737828..4e0a158ae1c23 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -96,10 +96,9 @@ Expected<int32_t> LevelZeroPluginTy::findDevices() {
 
   DP("Found %" PRIu32 " devices.\n", NumDevices);
   DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
-    for (auto &DeviceInfo : DetectedDevices) {
+  for (auto &DeviceInfo : DetectedDevices) {
     (void)DeviceInfo; // to avoid unused variable warning in non-debug builds
-    DP("-- Device %" PRIu32 "%s%s\n",
-       DeviceInfo.Id.RootId,
+    DP("-- Device %" PRIu32 "%s%s\n", DeviceInfo.Id.RootId,
        (DeviceInfo.Id.SubId < 0
             ? ""
             : ("." + std::to_string(DeviceInfo.Id.SubId)).c_str()),

>From 8bb25adc07981b4271670e73a1b8a506361648ca Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 29 Oct 2025 16:21:40 +0100
Subject: [PATCH 64/70] fixes

---
 .../plugins-nextgen/level_zero/include/L0Memory.h  |  2 +-
 .../plugins-nextgen/level_zero/src/L0Device.cpp    |  4 +++-
 .../plugins-nextgen/level_zero/src/L0Memory.cpp    | 14 ++++++++++----
 .../plugins-nextgen/level_zero/src/L0Plugin.cpp    | 11 +++--------
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index e7c8a2cd2f634..b6b19d96328da 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -330,7 +330,7 @@ class MemAllocatorTy {
   /// to support target pointer with offset, and positive "ActiveSize" is
   /// specified in such cases for correct debug logging.
   Expected<void *> allocL0(size_t Size, size_t Align, int32_t Kind,
-                           size_t ActiveSize = 0);
+                           size_t ActiveSize = 0, bool Logging = true);
 
   /// Allocate memory with the specified information from a memory pool
   Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index ed3da28ae43cf..3ec58ec963f19 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -156,7 +156,7 @@ std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
 }
 
 void L0DeviceTy::reportDeviceInfo() const {
-  DP("Device %" PRIu32 "\n", DeviceId);
+  DP("Device %" PRIu32 " information\n", DeviceId);
   DP("-- Name                         : %s\n", getNameCStr());
   DP("-- PCI ID                       : 0x%" PRIx32 "\n", getPCIId());
   DP("-- UUID                         : %s\n", getUuid().data());
@@ -216,6 +216,8 @@ Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
   if (auto Err = MemAllocator.initDevicePools(*this, Options))
     return Err;
   l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
+  if (getDebugLevel() > 0)
+    reportDeviceInfo();
   return Plugin::success();
 }
 
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 656bb34afc269..921bcd7972a68 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -83,7 +83,9 @@ Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
 
   // Check page size used for this allocation kind to decide minimum
   // allocation size when allocating from L0.
-  auto MemOrErr = Allocator->allocL0(8, 0, AllocKind);
+  auto MemOrErr =
+      Allocator->allocL0(/* Size=*/8, /*Align=*/0, AllocKind, /*ActiveSize=*/0,
+                         /*Logging=*/false);
   if (!MemOrErr)
     return MemOrErr.takeError();
   void *Mem = *MemOrErr;
@@ -610,7 +612,8 @@ Error MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size) {
 }
 
 Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
-                                         int32_t Kind, size_t ActiveSize) {
+                                         int32_t Kind, size_t ActiveSize,
+                                         bool Logging) {
   void *Mem = nullptr;
   ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
                                         nullptr, 0, 0};
@@ -649,8 +652,11 @@ Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
     assert(0 && "Invalid target data allocation kind");
   }
 
-  size_t LoggedSize = ActiveSize ? ActiveSize : Size;
-  log(LoggedSize, LoggedSize, Kind);
+  if (Logging) {
+    size_t LoggedSize = ActiveSize ? ActiveSize : Size;
+    log(LoggedSize, LoggedSize, Kind);
+  }
+
   if (makeResident) {
     assert(Device &&
            "Device is not set for memory allocation. Is this a Device Pool?");
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 4e0a158ae1c23..738f6d184d0fb 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -144,14 +144,9 @@ GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
                       (SubId < 0 ? "" : "." + std::to_string(SubId)) +
                       (CCSId < 0 ? "" : "." + std::to_string(CCSId));
 
-  auto *NewDevice = new L0DeviceTy(
-      static_cast<LevelZeroPluginTy &>(Plugin), DeviceId, NumDevices, zeDevice,
-      *zeDriver, std::move(IdStr), CCSId < 0 ? 0 : CCSId /* ComputeIndex */);
-  if (NewDevice && getDebugLevel() > 0) {
-    DP("Device %" PRIi32 " information\n", DeviceId);
-    NewDevice->reportDeviceInfo();
-  }
-  return NewDevice;
+  return new L0DeviceTy(static_cast<LevelZeroPluginTy &>(Plugin), DeviceId,
+                        NumDevices, zeDevice, *zeDriver, std::move(IdStr),
+                        CCSId < 0 ? 0 : CCSId /* ComputeIndex */);
 }
 
 GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {

>From 08d5f32d6457b96ffeda75d70ce1c53d54da6ea6 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 11:27:58 +0100
Subject: [PATCH 65/70] fix incorrect memory logging and MemPool small refactor

---
 .../level_zero/include/L0Memory.h             | 120 ++++++++++--------
 .../level_zero/src/L0Memory.cpp               |  74 ++++++-----
 2 files changed, 110 insertions(+), 84 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index b6b19d96328da..a895523bd0f15 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -69,7 +69,9 @@ struct MemAllocInfoTy {
   /// Base address allocated from compute runtime
   void *Base = nullptr;
   /// Allocation size known to users/libomptarget
-  size_t Size = 0;
+  size_t ReqSize = 0;
+  /// Allocation size known to the plugin (can be larger than ReqSize)
+  size_t AllocSize = 0;
   /// TARGET_ALLOC kind
   int32_t Kind = TARGET_ALLOC_DEFAULT;
   /// Allocation from pool?
@@ -79,10 +81,10 @@ struct MemAllocInfoTy {
 
   MemAllocInfoTy() = default;
 
-  MemAllocInfoTy(void *Base, size_t Size, int32_t Kind, bool InPool,
-                 bool ImplicitArg)
-      : Base(Base), Size(Size), Kind(Kind), InPool(InPool),
-        ImplicitArg(ImplicitArg) {}
+  MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
+                 bool InPool, bool ImplicitArg)
+      : Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
+        InPool(InPool), ImplicitArg(ImplicitArg) {}
 };
 
 /// Responsible for all activities involving memory allocation/deallocation.
@@ -240,8 +242,8 @@ class MemAllocatorTy {
 
   public:
     /// Add allocation information to the map
-    void add(void *Ptr, void *Base, size_t Size, int32_t Kind,
-             bool InPool = false, bool ImplicitArg = false);
+    void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
+             int32_t Kind, bool InPool = false, bool ImplicitArg = false);
 
     /// Remove allocation information for the given memory location
     bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
@@ -265,7 +267,7 @@ class MemAllocatorTy {
       --I;
       bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr &&
                  (uintptr_t)Ptr + (uintptr_t)Size <=
-                     (uintptr_t)I->first + (uintptr_t)I->second.Size;
+                     (uintptr_t)I->first + (uintptr_t)I->second.ReqSize;
       return Ret;
     }
 
@@ -307,6 +309,59 @@ class MemAllocatorTy {
   // hondling the Mtx lock
   Error deallocLocked(void *Ptr);
 
+  /// Allocate memory from L0 GPU RT
+  Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
+  /// Deallocate memory from L0 GPU RT
+  Error deallocFromL0(void *Ptr);
+
+  /// Allocate memory from L0 and record the allocation in the statistics.
+  /// Over-allocation is used to support target pointers with an offset; in
+  /// that case a positive "ActiveSize" is logged as the requested size.
+  Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
+                                     size_t ActiveSize = 0) {
+    auto MemOrErr = allocFromL0(Size, Align, Kind);
+    if (MemOrErr) {
+      const size_t RequestedSize = ActiveSize > 0 ? ActiveSize : Size;
+      log(RequestedSize, Size, Kind);
+    }
+    return MemOrErr;
+  }
+
+  /// Update the statistics of the given kind: a positive ReqSize records an
+  /// allocation of Size bytes, a zero ReqSize records a release of Size bytes.
+  void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
+    if (Kind < 0 || Kind >= MaxMemKind)
+      return; // Stat is disabled
+    auto &Stat = Stats[Kind];
+    const int32_t Slot = Pool ? 1 : 0; // 0: non-pool, 1: pool
+    if (ReqSize == 0) {
+      Stat.Freed[Slot] += Size;
+      Stat.InUse[Slot] -= Size;
+    } else {
+      Stat.Requested[Slot] += ReqSize;
+      Stat.Allocated[Slot] += Size;
+      Stat.InUse[Slot] += Size;
+      ++Stat.NumAllocs[Slot];
+    }
+    Stat.PeakUse[Slot] = (std::max)(Stat.PeakUse[Slot], Stat.InUse[Slot]);
+  }
+
+  /// Perform copy operation
+  Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+  /// Perform memory fill operation
+  Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
+
+  /// Allocate memory with the specified information from a memory pool
+  Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
+                                 intptr_t Offset, bool UserAlloc,
+                                 bool DevMalloc, uint32_t MemAdvice,
+                                 AllocOptionTy AllocOpt);
+  /// Deallocate memory from memory pool while holding the Mtx lock
+  Error deallocFromPool(void *Ptr) {
+    const std::lock_guard<std::mutex> Guard(Mtx);
+    return deallocLocked(Ptr);
+  }
+
 public:
   MemAllocatorTy() : MaxAllocSize(std::numeric_limits<uint64_t>::max()) {}
 
@@ -316,32 +371,23 @@ class MemAllocatorTy {
   MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
   ~MemAllocatorTy() {}
 
-  /// Release resources and report statistics if requested
-  Error deinit();
-
-  /// Allocator only supports host memory
-  bool supportsHostMem() { return IsHostMem; }
-
   Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
   Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
   void updateMaxAllocSize(L0DeviceTy &L0Device);
 
-  /// Allocate memory from L0 GPU RT. We use over-allocation workaround
-  /// to support target pointer with offset, and positive "ActiveSize" is
-  /// specified in such cases for correct debug logging.
-  Expected<void *> allocL0(size_t Size, size_t Align, int32_t Kind,
-                           size_t ActiveSize = 0, bool Logging = true);
+  /// Release resources and report statistics if requested
+  Error deinit();
 
   /// Allocate memory with the specified information from a memory pool
   Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
                          intptr_t Offset, bool UserAlloc, bool DevMalloc,
-                         uint32_t MemAdvice, AllocOptionTy AllocOpt);
+                         uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+    return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
+                         MemAdvice, AllocOpt);
+  }
 
   /// Deallocate memory
-  Error dealloc(void *Ptr) {
-    std::lock_guard<std::mutex> Lock(Mtx);
-    return deallocLocked(Ptr);
-  }
+  Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }
 
   /// Check if the given memory location and offset belongs to any allocated
   /// memory
@@ -368,32 +414,6 @@ class MemAllocatorTy {
       Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
     return Ret;
   }
-
-  /// Log memory allocation/deallocation
-  void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
-    if (Kind < 0 || Kind >= MaxMemKind)
-      return; // Stat is disabled
-
-    auto &ST = Stats[Kind];
-    int32_t I = Pool ? 1 : 0;
-    if (ReqSize > 0) {
-      ST.Requested[I] += ReqSize;
-      ST.Allocated[I] += Size;
-      ST.InUse[I] += Size;
-      ST.NumAllocs[I]++;
-    } else {
-      ST.Freed[I] += Size;
-      ST.InUse[I] -= Size;
-    }
-    ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
-  }
-
-  /// Perform copy operation
-  Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
-
-  /// Perform memory fill operation
-  Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
-
 }; /// MemAllocatorTy
 
 // simple generic wrapper to reuse objects
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 921bcd7972a68..5ad371a92db53 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -83,9 +83,7 @@ Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
 
   // Check page size used for this allocation kind to decide minimum
   // allocation size when allocating from L0.
-  auto MemOrErr =
-      Allocator->allocL0(/* Size=*/8, /*Align=*/0, AllocKind, /*ActiveSize=*/0,
-                         /*Logging=*/false);
+  auto MemOrErr = Allocator->allocFromL0(8, 0, AllocKind);
   if (!MemOrErr)
     return MemOrErr.takeError();
   void *Mem = *MemOrErr;
@@ -94,7 +92,8 @@ Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
       ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
   CALL_ZE_RET_ERROR(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
   AllocUnit = (std::max)(AP.pageSize, AllocUnit);
-  CALL_ZE_RET_ERROR(zeMemFree, Context, Mem);
+  if (auto Err = Allocator->deallocFromL0(Mem))
+    return Err;
 
   bool IsDiscrete = false;
   if (Device) {
@@ -250,9 +249,11 @@ Error MemAllocatorTy::MemPoolTy::deinit() {
     for (auto *Block : Bucket) {
       if (DebugLevel > 0)
         Allocator->log(0, Block->Size, AllocKind);
-      CALL_ZE_RET_ERROR(zeMemFree, Allocator->L0Context->getZeContext(),
-                        reinterpret_cast<void *>(Block->Base));
+      auto Err =
+          Allocator->deallocFromL0(reinterpret_cast<void *>(Block->Base));
       delete Block;
+      if (Err)
+        return Err;
     }
   }
   return Plugin::success();
@@ -287,12 +288,11 @@ Expected<void *> MemAllocatorTy::MemPoolTy::alloc(size_t Size,
     // Bucket is empty or all blocks in the bucket are full
     const auto ChunkSize = BucketParams[BucketId].first;
     const auto BlockSize = BucketParams[BucketId].second;
-    auto BaseOrErr = Allocator->allocL0(BlockSize, 0, AllocKind);
+    auto BaseOrErr = Allocator->allocFromL0AndLog(BlockSize, 0, AllocKind);
     if (!BaseOrErr)
       return BaseOrErr.takeError();
 
     void *Base = *BaseOrErr;
-
     if (ZeroInit) {
       auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
       if (Err)
@@ -330,22 +330,22 @@ size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
   return Deallocated;
 }
 
-void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
-                                            int32_t Kind, bool InPool,
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t ReqSize,
+                                            size_t AllocSize, int32_t Kind, bool InPool,
                                             bool ImplicitArg) {
   const auto Inserted =
-      Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg});
+      Map.emplace(Ptr, MemAllocInfoTy{Base, ReqSize, AllocSize, Kind, InPool, ImplicitArg});
   // Check if we keep valid disjoint memory ranges.
   [[maybe_unused]] bool Valid = Inserted.second;
   if (Valid) {
     if (Inserted.first != Map.begin()) {
       const auto I = std::prev(Inserted.first, 1);
-      Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr;
+      Valid = Valid && (uintptr_t)I->first + I->second.ReqSize <= (uintptr_t)Ptr;
     }
     if (Valid) {
       const auto I = std::next(Inserted.first, 1);
       if (I != Map.end())
-        Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first;
+        Valid = Valid && (uintptr_t)Ptr + ReqSize <= (uintptr_t)I->first;
     }
   }
   assert(Valid && "Invalid overlapping memory allocation");
@@ -483,10 +483,11 @@ Error MemAllocatorTy::deinit() {
 }
 
 /// Allocate memory with the specified information
-Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
-                                       intptr_t Offset, bool UserAlloc,
-                                       bool DevMalloc, uint32_t MemAdvice,
-                                       AllocOptionTy AllocOpt) {
+Expected<void *> MemAllocatorTy::allocFromPool(size_t Size, size_t Align,
+                                               int32_t Kind, intptr_t Offset,
+                                               bool UserAlloc, bool DevMalloc,
+                                               uint32_t MemAdvice,
+                                               AllocOptionTy AllocOpt) {
   assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
           Kind == TARGET_ALLOC_SHARED) &&
          "Unknown memory kind while allocating target memory");
@@ -530,7 +531,7 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
       if (Align > 0)
         Base = (Base + Align) & ~(Align - 1);
       Mem = (void *)(Base + Offset);
-      AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc);
+      AllocInfo.add(Mem, AllocBase, Size, PoolAllocSize, Kind, true, UserAlloc);
       log(Size, PoolAllocSize, Kind, true /* Pool */);
       if (DevMalloc)
         MemOwned.push_back(AllocBase);
@@ -542,13 +543,14 @@ Expected<void *> MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
     }
   }
 
-  auto AllocBaseOrErr = allocL0(AllocSize, Align, Kind, Size);
+  auto AllocBaseOrErr =
+      allocFromL0AndLog(AllocSize, Align, Kind, /*ActiveSize=*/Size);
   if (!AllocBaseOrErr)
     return AllocBaseOrErr.takeError();
   AllocBase = *AllocBaseOrErr;
   if (AllocBase) {
     Mem = (void *)((uintptr_t)AllocBase + Offset);
-    AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
+    AllocInfo.add(Mem, AllocBase, Size, AllocSize, Kind, false, UserAlloc);
     if (DevMalloc)
       MemOwned.push_back(AllocBase);
     if (UseDedicatedPool) {
@@ -594,11 +596,12 @@ Error MemAllocatorTy::deallocLocked(void *Ptr) {
                          "Cannot find base address of " DPxMOD "\n",
                          DPxPTR(Ptr));
   }
-  CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Info.Base);
-  log(0, Info.Size, Info.Kind);
+  log(0, Info.AllocSize, Info.Kind);
 
+  if (auto Err = deallocFromL0(Info.Base))
+    return Err;
   DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
-     DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
+     DPxPTR(Ptr), DPxPTR(Info.Base), Info.AllocSize);
 
   return Plugin::success();
 }
@@ -611,9 +614,8 @@ Error MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size) {
   return Device->enqueueMemCopy(Dst, Src, Size);
 }
 
-Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
-                                         int32_t Kind, size_t ActiveSize,
-                                         bool Logging) {
+Expected<void *> MemAllocatorTy::allocFromL0(size_t Size, size_t Align,
+                                             int32_t Kind) {
   void *Mem = nullptr;
   ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
                                         nullptr, 0, 0};
@@ -637,26 +639,24 @@ Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
     makeResident = true;
     CALL_ZE_RET_ERROR(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
                       zeDevice, &Mem);
-    DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
+    DP("Allocated %zu bytes of device memory " DPxMOD "\n", Size,
+       DPxPTR(Mem));
     break;
   case TARGET_ALLOC_HOST:
     CALL_ZE_RET_ERROR(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
-    DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
+    DP("Allocated %zu bytes of host memory " DPxMOD "\n", Size,
+       DPxPTR(Mem));
     break;
   case TARGET_ALLOC_SHARED:
     CALL_ZE_RET_ERROR(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
                       Align, zeDevice, &Mem);
-    DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
+    DP("Allocated %zu bytes of shared memory " DPxMOD "\n", Size,
+       DPxPTR(Mem));
     break;
   default:
     assert(0 && "Invalid target data allocation kind");
   }
 
-  if (Logging) {
-    size_t LoggedSize = ActiveSize ? ActiveSize : Size;
-    log(LoggedSize, LoggedSize, Kind);
-  }
-
   if (makeResident) {
     assert(Device &&
            "Device is not set for memory allocation. Is this a Device Pool?");
@@ -668,6 +668,12 @@ Expected<void *> MemAllocatorTy::allocL0(size_t Size, size_t Align,
   return Mem;
 }
 
+Error MemAllocatorTy::deallocFromL0(void *Ptr) {
+  CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Ptr);
+  DP("Freed device pointer " DPxMOD "\n", DPxPTR(Ptr));
+  return Plugin::success();
+}
+
 Expected<ze_event_handle_t> EventPoolTy::getEvent() {
   std::lock_guard<std::mutex> Lock(*Mtx);
 

>From ba087096de07fdffb95f9fe528457c4575d80ae3 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 12:22:47 +0100
Subject: [PATCH 66/70] Refactor code from comments

---
 .../level_zero/include/L0Kernel.h             |  4 --
 .../level_zero/include/L0Trace.h              | 37 -----------
 .../level_zero/src/L0Kernel.cpp               | 62 ++++++++-----------
 3 files changed, 26 insertions(+), 77 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index e07ae81c0ff7a..eb071b4018c2d 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -110,10 +110,6 @@ class L0KernelTy : public GenericKernelTy {
   KernelPropertiesTy Properties;
   KernelPropertiesTy &getProperties() { return Properties; }
 
-  Error runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
-                            KernelLaunchParamsTy LaunchParams,
-                            __tgt_async_info *AsyncInfo) const;
-
   void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
                                   uint32_t ThreadLimit,
                                   TgtNDRangeDescTy *LoopLevels,
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index 22170b723a31b..9a82da665f560 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -41,15 +41,6 @@
       Rc = Fn(__VA_ARGS__);                                                    \
   } while (0)
 
-#define CALL_ZE_RC(Rc, Fn, ...)                                                \
-  do {                                                                         \
-    CALL_ZE(Rc, Fn, __VA_ARGS__);                                              \
-    if (Rc != ZE_RESULT_SUCCESS) {                                             \
-      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc,    \
-         getZeErrorName(Rc));                                                  \
-    }                                                                          \
-  } while(0)
-
 /// For non-thread-safe functions
 #define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...)                                     \
   do {                                                                         \
@@ -64,12 +55,6 @@
     }                                                                          \
   } while (0)
 
-#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...)                                     \
-  CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__)
-#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...)                                     \
-  CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
-#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...)                                     \
-  CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
 #define CALL_ZE_RET_ERROR_MTX(Fn, Mtx, ...)                                   \
   CALL_ZE_RET_MTX(                                                            \
     Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s",          \
@@ -88,32 +73,11 @@
     }                                                                          \
   } while (0)
 
-#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
-#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__)
-#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__)
-#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__)
 #define CALL_ZE_RET_ERROR(Fn, ...)                                             \
   CALL_ZE_RET(                                                                 \
     Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s",           \
     STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
 
-
-
-#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...)                                     \
-  do {                                                                         \
-    ze_result_t rc;                                                            \
-    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
-    if (rc != ZE_RESULT_SUCCESS) {                                             \
-      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
-         getZeErrorName(rc));                                                  \
-      const char *err_str = nullptr;                                           \
-      rc = zeDriverGetLastErrorDescription(                                    \
-          Dev.getDriverHandle(), &err_str);                                    \
-      fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn,          \
-              err_str);                                                        \
-    }                                                                          \
-  } while (0)
-
 #define CALL_ZE_EXIT_FAIL(Fn, ...)                                             \
   do {                                                                         \
     ze_result_t rc;                                                            \
@@ -133,7 +97,6 @@
       return Ret;                                                              \
   } while (0)
 
-
 #define CALL_ZE_EXT_RET_ERROR(Device, Name, ...)                               \
   CALL_ZE_EXT_SILENT_RET(Device,                                               \
       Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s",          \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index c04ca74287cd7..1ad73ba4969be 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -17,17 +17,6 @@
 
 namespace llvm::omp::target::plugin {
 
-Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
-                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
-                             KernelArgsTy &KernelArgs,
-                             KernelLaunchParamsTy LaunchParams,
-                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-
-  auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
-  return runTargetTeamRegion(l0Device, KernelArgs, std::move(LaunchParams),
-                             AsyncInfoWrapper);
-}
-
 Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
   const auto *KernelName = getName();
 
@@ -42,8 +31,7 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
                            DeviceImageTy &Image) {
   auto &Program = L0ProgramTy::makeL0Program(Image);
 
-  Error Err = buildKernel(Program);
-  if (Err)
+  if (auto Err = buildKernel(Program))
     return Err;
   Program.addKernel(this);
 
@@ -442,10 +430,14 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
   return Plugin::success();
 }
 
-Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
-                                      KernelArgsTy &KernelArgs,
-                                      KernelLaunchParamsTy LaunchParams,
-                                      __tgt_async_info *AsyncInfo) const {
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
+                             KernelArgsTy &KernelArgs,
+                             KernelLaunchParamsTy LaunchParams,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
+  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
+
   // Libomptarget can pass negative NumTeams and ThreadLimit now after
   // introducing __tgt_target_kernel. This happens only when we have valid
   // LoopDesc and the region is not a teams region.
@@ -461,15 +453,13 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
     NumTeams = 0;
   if (ThreadLimit < 0)
     ThreadLimit = 0;
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-       "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
+       DPxPTR(zeKernel));
 
   auto &Plugin = l0Device.getPlugin();
-  auto &Device = Plugin.getDeviceFromId(DeviceId);
-
-  auto *IdStr = Device.getZeIdCStr();
+  auto *IdStr = l0Device.getZeIdCStr();
   auto &Options = LevelZeroPluginTy::getOptions();
-  bool IsAsync = AsyncInfo && Device.asyncEnabled();
+  bool IsAsync = AsyncInfo && l0Device.asyncEnabled();
   if (IsAsync && !AsyncInfo->Queue) {
     AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
     if (!AsyncInfo->Queue)
@@ -496,7 +486,7 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
       GroupSizes, GroupCounts, AllowCooperative);
 
   if (!GroupParamsReused) {
-    if (auto Err = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+    if (auto Err = getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes,
                                   GroupCounts, LoopDesc, AllowCooperative))
       return Err;
     KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
@@ -526,12 +516,12 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   // Set Kernel Indirect flags
   auto &PrevFlags = KernelPR.IndirectAccessFlags;
   ze_kernel_indirect_access_flags_t Flags = 0;
-  Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
-  Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
 
   if (PrevFlags != Flags) {
     // Combine with common access flags
-    const auto FinalFlags = Device.getIndirectFlags() | Flags;
+    const auto FinalFlags = l0Device.getIndirectFlags() | Flags;
     CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
     DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
     PrevFlags = Flags;
@@ -544,20 +534,20 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
 
   ze_command_list_handle_t CmdList = nullptr;
   ze_command_queue_handle_t CmdQueue = nullptr;
-  const bool UseImmCmdList = Device.useImmForCompute();
+  const bool UseImmCmdList = l0Device.useImmForCompute();
 
   if (UseImmCmdList) {
-    auto CmdListOrErr = Device.getImmCmdList();
+    auto CmdListOrErr = l0Device.getImmCmdList();
     if (!CmdListOrErr)
       return CmdListOrErr.takeError();
     CmdList = *CmdListOrErr;
     // Command queue is not used with immediate command list
   } else {
-    auto CmdListOrErr = Device.getCmdList();
+    auto CmdListOrErr = l0Device.getCmdList();
     if (!CmdListOrErr)
       return CmdListOrErr.takeError();
     CmdList = *CmdListOrErr;
-    auto CmdQueueOrErr = Device.getCmdQueue();
+    auto CmdQueueOrErr = l0Device.getCmdQueue();
     if (!CmdQueueOrErr)
       return CmdQueueOrErr.takeError();
     CmdQueue = *CmdQueueOrErr;
@@ -566,7 +556,7 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
   if (UseImmCmdList) {
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Using immediate command list for kernel submission.\n");
-    auto EventOrError = Device.getEvent();
+    auto EventOrError = l0Device.getEvent();
     if (!EventOrError)
       return EventOrError.takeError();
     ze_event_handle_t Event = *EventOrError;
@@ -596,7 +586,7 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
       AsyncQueue->KernelEvent = Event;
     } else {
       CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
-      if (auto Err = Device.releaseEvent(Event))
+      if (auto Err = l0Device.releaseEvent(Event))
         return Err;
     }
   } else {
@@ -609,14 +599,14 @@ Error L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
                         &GroupCounts, Event, 0, nullptr);
     KernelLock.unlock();
     CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
-    CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
-                          CmdQueue, 1, &CmdList, nullptr);
+    CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists,
+                          l0Device.getMutex(), CmdQueue, 1, &CmdList, nullptr);
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
     CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
     CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
     if (Event) {
-      if (auto Err = Device.releaseEvent(Event))
+      if (auto Err = l0Device.releaseEvent(Event))
         return Err;
     }
   }

>From 69c0db392964b2a29939f9b34faba2a386b7d7f4 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 22:46:56 +0100
Subject: [PATCH 67/70] Restructure kernel launching code

---
 .../level_zero/include/L0Kernel.h             |  73 ++--
 .../level_zero/src/L0Kernel.cpp               | 375 ++++++++++--------
 2 files changed, 251 insertions(+), 197 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index eb071b4018c2d..48cb4f7a6de40 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -13,6 +13,7 @@
 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
 
+#include "AsyncQueue.h"
 #include "L0Defs.h"
 #include "L0Trace.h"
 #include "PluginInterface.h"
@@ -48,6 +49,9 @@ struct TgtNDRangeDescTy {
   }
 };
 
+/// Forward declaration; the struct is defined below KernelPropertiesTy.
+struct L0LaunchEnvTy;
+
 /// Kernel properties.
 struct KernelPropertiesTy {
   uint32_t Width = 0;
@@ -70,37 +74,28 @@ struct KernelPropertiesTy {
   static constexpr TgtNDRangeDescTy LoopDescInit = {};
 
   /// Check if we can reuse group parameters.
-  bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
-                        const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
-                        uint32_t *GroupSizesOut,
-                        ze_group_count_t &GroupCountsOut,
-                        bool &AllowCooperativeOut) const {
-    if (!LoopDescPtr && LoopDescInit != LoopDesc)
-      return false;
-    if (LoopDescPtr && *LoopDescPtr != LoopDesc)
-      return false;
-    if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
-      return false;
-    // Found matching input parameters.
-    std::copy_n(GroupSizes, 3, GroupSizesOut);
-    GroupCountsOut = GroupCounts;
-    AllowCooperativeOut = AllowCooperative;
-    return true;
-  }
+  bool reuseGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
+                        uint32_t *GroupSizesOut, L0LaunchEnvTy &KEnv) const;
 
   /// Update cached group parameters.
-  void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
-                        const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
-                        const uint32_t *GroupSizesIn,
-                        const ze_group_count_t &GroupCountsIn,
-                        const bool &AllowCooperativeIn) {
-    LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
-    NumTeams = NumTeamsIn;
-    ThreadLimit = ThreadLimitIn;
-    std::copy_n(GroupSizesIn, 3, GroupSizes);
-    GroupCounts = GroupCountsIn;
-    AllowCooperative = AllowCooperativeIn;
-  }
+  void cacheGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
+                        const uint32_t *GroupSizesIn, L0LaunchEnvTy &KEnv);
+};
+
+struct L0LaunchEnvTy {
+  bool IsAsync;
+  AsyncQueueTy *AsyncQueue;
+  ze_group_count_t GroupCounts;
+  KernelPropertiesTy &KernelPR;
+  bool HalfNumThreads = false;
+  bool IsTeamsNDRange = false;
+
+  bool AllowCooperative = false;
+  TgtNDRangeDescTy *LoopDesc = nullptr;
+
+  L0LaunchEnvTy(bool IsAsync, AsyncQueueTy *AsyncQueue,
+                KernelPropertiesTy &KernelPR)
+      : IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR) {}
 };
 
 class L0KernelTy : public GenericKernelTy {
@@ -111,22 +106,19 @@ class L0KernelTy : public GenericKernelTy {
   KernelPropertiesTy &getProperties() { return Properties; }
 
   void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
-                                  uint32_t ThreadLimit,
-                                  TgtNDRangeDescTy *LoopLevels,
-                                  uint32_t *GroupSizes,
-                                  ze_group_count_t &GroupCounts,
-                                  bool HalfNumThreads,
-                                  bool IsTeamsNDRange) const;
+                                  uint32_t ThreadLimit, uint32_t *GroupSizes,
+                                  L0LaunchEnvTy &KEnv) const;
 
   Error decideLoopKernelGroupArguments(L0DeviceTy &Device, uint32_t ThreadLimit,
-                                       TgtNDRangeDescTy *LoopLevels,
                                        uint32_t *GroupSizes,
-                                       ze_group_count_t &GroupCounts,
-                                       bool HalfNumThreads,
-                                       bool &AllowCooperative) const;
+                                       L0LaunchEnvTy &KEnv) const;
 
   Error buildKernel(L0ProgramTy &Program);
 
+  Error setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
+                        int32_t NumTeams, int32_t ThreadLimit) const;
+  Error setIndirectFlags(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv) const;
+
 public:
   /// Create a L0 kernel with a name and an execution mode.
   L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
@@ -160,8 +152,7 @@ class L0KernelTy : public GenericKernelTy {
 
   Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                        int32_t ThreadLimit, uint32_t *GroupSizes,
-                       ze_group_count_t &GroupCounts, void *LoopDesc,
-                       bool &AllowCooperative) const;
+                       L0LaunchEnvTy &KEnv) const;
 };
 
 } // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 1ad73ba4969be..c2227f06d3490 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -17,6 +17,35 @@
 
 namespace llvm::omp::target::plugin {
 
+bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
+                                          const int32_t ThreadLimitIn,
+                                          uint32_t *GroupSizesOut,
+                                          L0LaunchEnvTy &KEnv) const {
+  if (!KEnv.LoopDesc && LoopDescInit != LoopDesc)
+    return false;
+  if (KEnv.LoopDesc && *KEnv.LoopDesc != LoopDesc)
+    return false;
+  if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
+    return false;
+  // Found matching input parameters.
+  std::copy_n(GroupSizes, 3, GroupSizesOut);
+  KEnv.GroupCounts = GroupCounts;
+  KEnv.AllowCooperative = AllowCooperative;
+  return true;
+}
+
+void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn,
+                                          const int32_t ThreadLimitIn,
+                                          const uint32_t *GroupSizesIn,
+                                          L0LaunchEnvTy &KEnv) {
+  LoopDesc = KEnv.LoopDesc ? *KEnv.LoopDesc : LoopDescInit;
+  NumTeams = NumTeamsIn;
+  ThreadLimit = ThreadLimitIn;
+  std::copy_n(GroupSizesIn, 3, GroupSizes);
+  GroupCounts = KEnv.GroupCounts;
+  AllowCooperative = KEnv.AllowCooperative;
+}
+
 Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
   const auto *KernelName = getName();
 
@@ -38,11 +67,11 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
   return Plugin::success();
 }
 
-void L0KernelTy::decideKernelGroupArguments(
-    L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
-    TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
-    ze_group_count_t &GroupCounts, bool HalfNumThreads,
-    bool IsTeamsNDRange) const {
+void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
+                                            uint32_t NumTeams,
+                                            uint32_t ThreadLimit,
+                                            uint32_t *GroupSizes,
+                                            L0LaunchEnvTy &KEnv) const {
 
   const KernelPropertiesTy &KernelPR = getProperties();
 
@@ -52,6 +81,7 @@ void L0KernelTy::decideKernelGroupArguments(
   uint32_t MaxGroupSize = Device.getMaxGroupSize();
   const auto &Option = LevelZeroPluginTy::getOptions();
   const auto OptSubscRate = Option.SubscriptionRate;
+  auto &GroupCounts = KEnv.GroupCounts;
 
   uint32_t SIMDWidth = KernelPR.SIMDWidth;
   uint32_t KernelWidth = KernelPR.Width;
@@ -86,7 +116,7 @@ void L0KernelTy::decideKernelGroupArguments(
   } else {
     const uint32_t NumSubslices = Device.getNumSubslices();
     uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
-    if (HalfNumThreads)
+    if (KEnv.HalfNumThreads)
       NumThreadsPerSubslice /= 2;
 
     MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
@@ -125,6 +155,7 @@ void L0KernelTy::decideKernelGroupArguments(
     }
 
     size_t LoopTripcount = 0;
+    TgtNDRangeDescTy *LoopLevels = KEnv.LoopDesc;
     if (LoopLevels) {
       // TODO: consider other possible LoopDesc uses
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
@@ -151,9 +182,10 @@ void L0KernelTy::decideKernelGroupArguments(
       const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
                                      Device.getNumSubslices() * SIMDWidth;
       size_t AdjustedGroupCount =
-          IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
-                                      MaxTotalThreads / GRPSizes[0])
-                         : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+          KEnv.IsTeamsNDRange
+              ? (std::min)(((LoopTripcount + 7) & ~7),
+                           MaxTotalThreads / GRPSizes[0])
+              : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
       AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
       AdjustedGroupCount *= OptSubscRate;
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
@@ -206,15 +238,17 @@ static uint64_t computeThreadsNeeded(const llvm::ArrayRef<size_t> TripCounts,
   return GroupCount[0] * ThreadsPerWG;
 }
 
-Error L0KernelTy::decideLoopKernelGroupArguments(
-    L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
-    uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
-    bool &AllowCooperative) const {
+Error L0KernelTy::decideLoopKernelGroupArguments(L0DeviceTy &Device,
+                                                 uint32_t ThreadLimit,
+                                                 uint32_t *GroupSizes,
+                                                 L0LaunchEnvTy &KEnv) const {
 
   const auto DeviceId = Device.getDeviceId();
   const auto &Options = LevelZeroPluginTy::getOptions();
   const auto &KernelPR = getProperties();
   uint32_t MaxGroupSize = Device.getMaxGroupSize();
+  TgtNDRangeDescTy *LoopLevels = KEnv.LoopDesc;
+  auto &GroupCounts = KEnv.GroupCounts;
 
   bool MaxGroupSizeForced = false;
   if (ThreadLimit > 0) {
@@ -270,7 +304,7 @@ Error L0KernelTy::decideLoopKernelGroupArguments(
       }
       GRPCounts[DistributeDim] = DistributeTripCount;
     }
-    AllowCooperative = false;
+    KEnv.AllowCooperative = false;
     GroupCounts.groupCountX = GRPCounts[0];
     GroupCounts.groupCountY = GRPCounts[1];
     GroupCounts.groupCountZ = GRPCounts[2];
@@ -346,7 +380,7 @@ Error L0KernelTy::decideLoopKernelGroupArguments(
     }
     GRPCounts[I] = (uint32_t)Count;
   }
-  AllowCooperative = false;
+  KEnv.AllowCooperative = false;
   GroupCounts.groupCountX = GRPCounts[0];
   GroupCounts.groupCountY = GRPCounts[1];
   GroupCounts.groupCountZ = GRPCounts[2];
@@ -357,8 +391,7 @@ Error L0KernelTy::decideLoopKernelGroupArguments(
 
 Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                                  int32_t ThreadLimit, uint32_t *GroupSizes,
-                                 ze_group_count_t &GroupCounts, void *LoopDesc,
-                                 bool &AllowCooperative) const {
+                                 L0LaunchEnvTy &KEnv) const {
 
   const auto DeviceId = Device.getDeviceId();
   const auto &KernelPR = getProperties();
@@ -367,7 +400,7 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
   const auto [NumTeamsICV, ThreadLimitICV] = std::make_tuple(0, 0);
 
   bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
-  bool HalfNumThreads =
+  KEnv.HalfNumThreads =
       LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
   uint32_t KernelWidth = KernelPR.Width;
   uint32_t SIMDWidth = KernelPR.SIMDWidth;
@@ -419,12 +452,161 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
          NumTeams);
     }
 
-    bool UseLoopTC = LoopDesc;
-    decideKernelGroupArguments(
-        Device, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
-        UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
-        GroupCounts, HalfNumThreads, false);
-    AllowCooperative = false;
+    decideKernelGroupArguments(Device, (uint32_t)NumTeams,
+                               (uint32_t)ThreadLimit, GroupSizes, KEnv);
+    KEnv.AllowCooperative = false;
+  }
+
+  return Plugin::success();
+}
+
+static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
+                                        ze_kernel_handle_t zeKernel,
+                                        L0LaunchEnvTy &KEnv,
+                                        CommandModeTy CommandMode) {
+  const auto DeviceId = l0Device.getDeviceId();
+  auto *IdStr = l0Device.getZeIdCStr();
+  auto CmdListOrErr = l0Device.getImmCmdList();
+  if (!CmdListOrErr)
+    return CmdListOrErr.takeError();
+  const ze_command_list_handle_t CmdList = *CmdListOrErr;
+  // Command queue is not used with immediate command list
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Using immediate command list for kernel submission.\n");
+  auto EventOrError = l0Device.getEvent();
+  if (!EventOrError)
+    return EventOrError.takeError();
+  ze_event_handle_t Event = *EventOrError;
+  size_t NumWaitEvents = 0;
+  ze_event_handle_t *WaitEvents = nullptr;
+  auto *AsyncQueue = KEnv.AsyncQueue;
+  if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) {
+    if (CommandMode == CommandModeTy::AsyncOrdered) {
+      NumWaitEvents = 1;
+      WaitEvents = &AsyncQueue->WaitEvents.back();
+    } else {
+      NumWaitEvents = AsyncQueue->WaitEvents.size();
+      WaitEvents = AsyncQueue->WaitEvents.data();
+    }
+  }
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+  if (KEnv.AllowCooperative)
+    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                      zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
+                      WaitEvents);
+  else
+    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                      &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+  KEnv.KernelPR.Mtx.unlock();
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+
+  if (KEnv.IsAsync) {
+    AsyncQueue->WaitEvents.push_back(Event);
+    AsyncQueue->KernelEvent = Event;
+  } else {
+    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
+    if (auto Err = l0Device.releaseEvent(Event))
+      return Err;
+  }
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+       IdStr);
+
+  return Plugin::success();
+}
+
+static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
+                                      ze_kernel_handle_t zeKernel,
+                                      L0LaunchEnvTy &KEnv) {
+  const auto DeviceId = l0Device.getDeviceId();
+  const auto *IdStr = l0Device.getZeIdCStr();
+
+  auto CmdListOrErr = l0Device.getCmdList();
+  if (!CmdListOrErr)
+    return CmdListOrErr.takeError();
+  ze_command_list_handle_t CmdList = *CmdListOrErr;
+  auto CmdQueueOrErr = l0Device.getCmdQueue();
+  if (!CmdQueueOrErr)
+    return CmdQueueOrErr.takeError();
+  const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr;
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Using regular command list for kernel submission.\n");
+
+  ze_event_handle_t Event = nullptr;
+  if (KEnv.AllowCooperative)
+    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                      zeKernel, &KEnv.GroupCounts, Event, 0, nullptr);
+  else
+    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                      &KEnv.GroupCounts, Event, 0, nullptr);
+  KEnv.KernelPR.Mtx.unlock();
+  CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+  CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
+                        CmdQueue, 1, &CmdList, nullptr);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+  CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+  CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+  if (Event) {
+    if (auto Err = l0Device.releaseEvent(Event))
+      return Err;
+  }
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+       IdStr);
+
+  return Plugin::success();
+}
+
+Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
+                                  int32_t NumTeams, int32_t ThreadLimit) const {
+  uint32_t GroupSizes[3];
+  auto DeviceId = l0Device.getDeviceId();
+  auto &KernelPR = KEnv.KernelPR;
+  // Check if we can reuse previous group parameters
+  bool GroupParamsReused =
+      KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);
+
+  if (!GroupParamsReused) {
+    if (auto Err =
+            getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv))
+      return Err;
+    KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+       GroupSizes[1], GroupSizes[2]);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+       KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY,
+       KEnv.GroupCounts.groupCountZ);
+
+  if (!GroupParamsReused) {
+    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0],
+                      GroupSizes[1], GroupSizes[2]);
+  }
+
+  return Plugin::success();
+}
+
+Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
+                                   L0LaunchEnvTy &KEnv) const {
+  // Set Kernel Indirect flags
+  ze_kernel_indirect_access_flags_t Flags = 0;
+  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+  if (KEnv.KernelPR.IndirectAccessFlags != Flags) {
+    // Combine with common access flags
+    const auto FinalFlags = l0Device.getIndirectFlags() | Flags;
+    CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags);
+    DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+    KEnv.KernelPR.IndirectAccessFlags = Flags;
   }
 
   return Plugin::success();
@@ -438,17 +620,11 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
   __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
 
-  // Libomptarget can pass negative NumTeams and ThreadLimit now after
-  // introducing __tgt_target_kernel. This happens only when we have valid
-  // LoopDesc and the region is not a teams region.
-
   auto zeKernel = getZeKernel();
   auto DeviceId = l0Device.getDeviceId();
   int32_t NumArgs = KernelArgs.NumArgs;
   int32_t NumTeams = KernelArgs.NumTeams[0];
   int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
-  void *LoopDesc = nullptr;
-
   if (NumTeams < 0)
     NumTeams = 0;
   if (ThreadLimit < 0)
@@ -471,36 +647,16 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   // We need to get a non-const version of the Properties structure in order to
   // use its lock and be able to cache the group params and indirect flags
   auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
-  // Protect from kernel preparation to submission as kernels are shared.
-  std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
 
-  // Decide group sizes and counts
-  uint32_t GroupSizes[3];
-  ze_group_count_t GroupCounts;
-
-  bool AllowCooperative = false;
+  L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
 
-  // Check if we can reuse previous group parameters
-  bool GroupParamsReused = KernelPR.reuseGroupParams(
-      static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
-      GroupSizes, GroupCounts, AllowCooperative);
+  // Protect from kernel preparation to submission as kernels are shared.
+  KernelPR.Mtx.lock();
 
-  if (!GroupParamsReused) {
-    if (auto Err = getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes,
-                                  GroupCounts, LoopDesc, AllowCooperative))
-      return Err;
-    KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
-                              NumTeams, ThreadLimit, GroupSizes, GroupCounts,
-                              AllowCooperative);
-  }
+  if (auto Err = setKernelGroups(l0Device, KEnv, NumTeams, ThreadLimit))
+    return Err;
 
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
-       GroupSizes[1], GroupSizes[2]);
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
-       GroupCounts.groupCountX, GroupCounts.groupCountY,
-       GroupCounts.groupCountZ);
+  // Set kernel arguments
   for (int32_t I = 0; I < NumArgs; I++) {
     {
       void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
@@ -513,109 +669,16 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
     }
   }
 
-  // Set Kernel Indirect flags
-  auto &PrevFlags = KernelPR.IndirectAccessFlags;
-  ze_kernel_indirect_access_flags_t Flags = 0;
-  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
-  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
-
-  if (PrevFlags != Flags) {
-    // Combine with common access flags
-    const auto FinalFlags = l0Device.getIndirectFlags() | Flags;
-    CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
-    DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
-    PrevFlags = Flags;
-  }
-
-  if (!GroupParamsReused) {
-    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
-                      GroupSizes[1], GroupSizes[2]);
-  }
+  if (auto Err = setIndirectFlags(l0Device, KEnv))
+    return Err;
 
-  ze_command_list_handle_t CmdList = nullptr;
-  ze_command_queue_handle_t CmdQueue = nullptr;
+  // Launch helpers unlock KernelPR.Mtx after the append (not on error paths).
   const bool UseImmCmdList = l0Device.useImmForCompute();
+  if (UseImmCmdList)
+    return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
+                                      Options.CommandMode);
 
-  if (UseImmCmdList) {
-    auto CmdListOrErr = l0Device.getImmCmdList();
-    if (!CmdListOrErr)
-      return CmdListOrErr.takeError();
-    CmdList = *CmdListOrErr;
-    // Command queue is not used with immediate command list
-  } else {
-    auto CmdListOrErr = l0Device.getCmdList();
-    if (!CmdListOrErr)
-      return CmdListOrErr.takeError();
-    CmdList = *CmdListOrErr;
-    auto CmdQueueOrErr = l0Device.getCmdQueue();
-    if (!CmdQueueOrErr)
-      return CmdQueueOrErr.takeError();
-    CmdQueue = *CmdQueueOrErr;
-  }
-
-  if (UseImmCmdList) {
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Using immediate command list for kernel submission.\n");
-    auto EventOrError = l0Device.getEvent();
-    if (!EventOrError)
-      return EventOrError.takeError();
-    ze_event_handle_t Event = *EventOrError;
-    size_t NumWaitEvents = 0;
-    ze_event_handle_t *WaitEvents = nullptr;
-    if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
-      if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
-        NumWaitEvents = 1;
-        WaitEvents = &AsyncQueue->WaitEvents.back();
-      } else {
-        NumWaitEvents = AsyncQueue->WaitEvents.size();
-        WaitEvents = AsyncQueue->WaitEvents.data();
-      }
-    }
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Kernel depends on %zu data copying events.\n", NumWaitEvents);
-    if (AllowCooperative)
-      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                        zeKernel, &GroupCounts, Event, NumWaitEvents,
-                        WaitEvents);
-    else
-      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                        &GroupCounts, Event, NumWaitEvents, WaitEvents);
-    KernelLock.unlock();
-    if (IsAsync) {
-      AsyncQueue->WaitEvents.push_back(Event);
-      AsyncQueue->KernelEvent = Event;
-    } else {
-      CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
-      if (auto Err = l0Device.releaseEvent(Event))
-        return Err;
-    }
-  } else {
-    ze_event_handle_t Event = nullptr;
-    if (AllowCooperative)
-      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                        zeKernel, &GroupCounts, Event, 0, nullptr);
-    else
-      CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                        &GroupCounts, Event, 0, nullptr);
-    KernelLock.unlock();
-    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
-    CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists,
-                          l0Device.getMutex(), CmdQueue, 1, &CmdList, nullptr);
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
-    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
-    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
-    if (Event) {
-      if (auto Err = l0Device.releaseEvent(Event))
-        return Err;
-    }
-  }
-
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
-       IdStr);
-
-  return Plugin::success();
+  return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
 }
 
 } // namespace llvm::omp::target::plugin

>From ba0ac45dc7a4f4a78b22ef2e698dfba01601a212 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 22:55:21 +0100
Subject: [PATCH 68/70] Remove unused code

---
 .../level_zero/include/L0Kernel.h             |  10 -
 .../level_zero/src/L0Kernel.cpp               | 259 +-----------------
 2 files changed, 5 insertions(+), 264 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 48cb4f7a6de40..b0c59be339c3a 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -59,7 +59,6 @@ struct KernelPropertiesTy {
   uint32_t MaxThreadGroupSize = 0;
 
   /// Cached input parameters used in the previous launch
-  TgtNDRangeDescTy LoopDesc;
   int32_t NumTeams = -1;
   int32_t ThreadLimit = -1;
 
@@ -71,8 +70,6 @@ struct KernelPropertiesTy {
 
   std::mutex Mtx;
 
-  static constexpr TgtNDRangeDescTy LoopDescInit = {};
-
   /// Check if we can reuse group parameters.
   bool reuseGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn,
                         uint32_t *GroupSizesOut, L0LaunchEnvTy &KEnv) const;
@@ -90,9 +87,6 @@ struct L0LaunchEnvTy {
   bool HalfNumThreads = false;
   bool IsTeamsNDRange = false;
 
-  bool AllowCooperative = false;
-  TgtNDRangeDescTy *LoopDesc = nullptr;
-
   L0LaunchEnvTy(bool IsAsync, AsyncQueueTy *AsyncQueue,
                 KernelPropertiesTy &KernelPR)
       : IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR) {}
@@ -109,10 +103,6 @@ class L0KernelTy : public GenericKernelTy {
                                   uint32_t ThreadLimit, uint32_t *GroupSizes,
                                   L0LaunchEnvTy &KEnv) const;
 
-  Error decideLoopKernelGroupArguments(L0DeviceTy &Device, uint32_t ThreadLimit,
-                                       uint32_t *GroupSizes,
-                                       L0LaunchEnvTy &KEnv) const;
-
   Error buildKernel(L0ProgramTy &Program);
 
   Error setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index c2227f06d3490..f0d32b78b4732 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -21,16 +21,11 @@ bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
                                           const int32_t ThreadLimitIn,
                                           uint32_t *GroupSizesOut,
                                           L0LaunchEnvTy &KEnv) const {
-  if (!KEnv.LoopDesc && LoopDescInit != LoopDesc)
-    return false;
-  if (KEnv.LoopDesc && *KEnv.LoopDesc != LoopDesc)
-    return false;
   if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
     return false;
   // Found matching input parameters.
   std::copy_n(GroupSizes, 3, GroupSizesOut);
   KEnv.GroupCounts = GroupCounts;
-  KEnv.AllowCooperative = AllowCooperative;
   return true;
 }
 
@@ -38,12 +33,10 @@ void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn,
                                           const int32_t ThreadLimitIn,
                                           const uint32_t *GroupSizesIn,
                                           L0LaunchEnvTy &KEnv) {
-  LoopDesc = KEnv.LoopDesc ? *KEnv.LoopDesc : LoopDescInit;
   NumTeams = NumTeamsIn;
   ThreadLimit = ThreadLimitIn;
   std::copy_n(GroupSizesIn, 3, GroupSizes);
   GroupCounts = KEnv.GroupCounts;
-  AllowCooperative = KEnv.AllowCooperative;
 }
 
 Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
@@ -148,51 +141,8 @@ void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
 
   uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
   uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
-  bool UsedReductionSubscriptionRate = false;
   if (!MaxGroupCountForced) {
-    {
-      GRPCounts[0] *= OptSubscRate;
-    }
-
-    size_t LoopTripcount = 0;
-    TgtNDRangeDescTy *LoopLevels = KEnv.LoopDesc;
-    if (LoopLevels) {
-      // TODO: consider other possible LoopDesc uses
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-           "Loop desciptor provided but specific ND-range is disabled\n");
-      // TODO: get rid of this constraint
-      if (LoopLevels->NumLoops > 1) {
-        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-             "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
-             LoopLevels->NumLoops);
-      } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
-        LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
-                         LoopLevels->Levels[0].Stride) /
-                        LoopLevels->Levels[0].Stride;
-        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-             "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
-             " = %zu\n",
-             LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
-             LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
-             LoopTripcount);
-      }
-    }
-
-    if (LoopTripcount && !UsedReductionSubscriptionRate) {
-      const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
-                                     Device.getNumSubslices() * SIMDWidth;
-      size_t AdjustedGroupCount =
-          KEnv.IsTeamsNDRange
-              ? (std::min)(((LoopTripcount + 7) & ~7),
-                           MaxTotalThreads / GRPSizes[0])
-              : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
-      AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
-      AdjustedGroupCount *= OptSubscRate;
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-           "Adjusting number of teams using the loop tripcount\n");
-      if (AdjustedGroupCount < GRPCounts[0])
-        GRPCounts[0] = AdjustedGroupCount;
-    }
+    GRPCounts[0] *= OptSubscRate;
   }
   GroupCounts.groupCountX = GRPCounts[0];
   GroupCounts.groupCountY = GRPCounts[1];
@@ -200,195 +150,6 @@ void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
   std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
 }
 
-// Return the number of total HW threads required to execute
-// a loop kernel compiled with the given SIMDWidth, and the given
-// loop(s) trip counts and group sizes.
-// Returns UINT64_MAX, if computations overflow.
-static uint64_t computeThreadsNeeded(const llvm::ArrayRef<size_t> TripCounts,
-                                     const llvm::ArrayRef<uint32_t> GroupSizes,
-                                     uint32_t SIMDWidth) {
-  assert(TripCounts.size() == 3 && "Invalid trip counts array size");
-  assert(GroupSizes.size() == 3 && "Invalid group sizes array size");
-  // Compute the number of groups in each dimension.
-  std::array<uint64_t, 3> GroupCount;
-
-  for (int I = 0; I < 3; ++I) {
-    if (TripCounts[I] == 0 || GroupSizes[I] == 0)
-      return (std::numeric_limits<uint64_t>::max)();
-    GroupCount[I] =
-        (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
-    if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
-      return (std::numeric_limits<uint64_t>::max)();
-  }
-  for (int I = 1; I < 3; ++I) {
-    if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
-      return (std::numeric_limits<uint64_t>::max)();
-    GroupCount[0] *= GroupCount[I];
-  }
-  // Multiplication of the group sizes must never overflow uint64_t
-  // for any existing device.
-  uint64_t LocalWorkSize =
-      uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
-  uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
-
-  // Check that the total number of threads fits uint64_t.
-  if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
-    return (std::numeric_limits<uint64_t>::max)();
-
-  return GroupCount[0] * ThreadsPerWG;
-}
-
-Error L0KernelTy::decideLoopKernelGroupArguments(L0DeviceTy &Device,
-                                                 uint32_t ThreadLimit,
-                                                 uint32_t *GroupSizes,
-                                                 L0LaunchEnvTy &KEnv) const {
-
-  const auto DeviceId = Device.getDeviceId();
-  const auto &Options = LevelZeroPluginTy::getOptions();
-  const auto &KernelPR = getProperties();
-  uint32_t MaxGroupSize = Device.getMaxGroupSize();
-  TgtNDRangeDescTy *LoopLevels = KEnv.LoopDesc;
-  auto &GroupCounts = KEnv.GroupCounts;
-
-  bool MaxGroupSizeForced = false;
-  if (ThreadLimit > 0) {
-    MaxGroupSizeForced = true;
-    MaxGroupSize = ThreadLimit;
-  }
-
-  uint32_t GRPCounts[3] = {1, 1, 1};
-  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
-  TgtLoopDescTy *Levels = LoopLevels->Levels;
-  int32_t DistributeDim = LoopLevels->DistributeDim;
-  assert(DistributeDim >= 0 && DistributeDim <= 2 &&
-         "Invalid distribute dimension.");
-  int32_t NumLoops = LoopLevels->NumLoops;
-  assert((NumLoops > 0 && NumLoops <= 3) &&
-         "Invalid loop nest description for ND partitioning");
-
-  // Compute global widths for X/Y/Z dimensions.
-  size_t TripCounts[3] = {1, 1, 1};
-
-  for (int32_t I = 0; I < NumLoops; I++) {
-    assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
-         ", Stride = %" PRId64 "\n",
-         I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
-    if (Levels[I].Ub < Levels[I].Lb)
-      TripCounts[I] = 0;
-    else
-      TripCounts[I] =
-          (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
-  }
-
-  // Check if any of the loop has zero iterations.
-  if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
-    std::fill(GroupSizes, GroupSizes + 3, 1);
-    std::fill(GRPCounts, GRPCounts + 3, 1);
-    if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
-      // There is a distribute dimension, and the distribute loop
-      // has non-zero iterations, but some inner parallel loop
-      // has zero iterations. We still want to split the distribute
-      // loop's iterations between many WGs (of size 1), but the inner/lower
-      // dimensions should be 1x1.
-      // Note that this code is currently dead, because we are not
-      // hoisting the inner loops' bounds outside of the target regions.
-      // The code is here just for completeness.
-      size_t DistributeTripCount = TripCounts[DistributeDim];
-      if (DistributeTripCount > UINT32_MAX) {
-        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-             "Invalid number of teams %zu due to large loop trip count\n",
-             DistributeTripCount);
-        return Plugin::success();
-      }
-      GRPCounts[DistributeDim] = DistributeTripCount;
-    }
-    KEnv.AllowCooperative = false;
-    GroupCounts.groupCountX = GRPCounts[0];
-    GroupCounts.groupCountY = GRPCounts[1];
-    GroupCounts.groupCountZ = GRPCounts[2];
-    return Plugin::success();
-  }
-
-  if (!MaxGroupSizeForced) {
-    // Use zeKernelSuggestGroupSize to compute group sizes,
-    // or fallback to setting dimension 0 width to SIMDWidth.
-    // Note that in case of user-specified LWS GRPSizes[0]
-    // is already set according to the specified value.
-    size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
-    if (DistributeDim > 0) {
-      // There is a distribute dimension.
-      GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
-      GlobalSizes[DistributeDim] = 1;
-    }
-
-    {
-      if (MaxGroupSize > KernelPR.Width) {
-        GRPSizes[0] = KernelPR.Width;
-      }
-      if (DistributeDim == 0) {
-        // If there is a distribute dimension, then we do not use
-        // thin HW threads, since we do not know anything about
-        // the iteration space of the inner parallel loop regions.
-        //
-        // If there is no distribute dimension, then try to use thiner
-        // HW threads to get more independent HW threads executing
-        // the kernel - this may allow more parallelism due to
-        // the stalls being distributed across multiple HW threads rather
-        // than across SIMD lanes within one HW thread.
-        assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
-               "Unexpected team sizes for dimensions 1 or/and 2.");
-        uint32_t SimdWidth = KernelPR.SIMDWidth;
-        uint64_t TotalThreads = Device.getTotalThreads();
-        TotalThreads *= Options.ThinThreadsThreshold;
-
-        uint32_t GRPSizePrev = GRPSizes[0];
-        uint64_t ThreadsNeeded =
-            computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
-        while (ThreadsNeeded < TotalThreads) {
-          GRPSizePrev = GRPSizes[0];
-          // Try to half the local work size (if possible) and see
-          // how many HW threads the kernel will require with this
-          // new local work size.
-          // In most implementations the initial GRPSizes[0]
-          // will be a power-of-two.
-          if (GRPSizes[0] <= 1)
-            break;
-          GRPSizes[0] >>= 1;
-          ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
-        }
-        GRPSizes[0] = GRPSizePrev;
-      }
-    }
-  }
-
-  for (int32_t I = 0; I < NumLoops; I++) {
-    if (I < DistributeDim) {
-      GRPCounts[I] = 1;
-      continue;
-    }
-    size_t Trip = TripCounts[I];
-    if (GRPSizes[I] >= Trip)
-      GRPSizes[I] = Trip;
-    size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
-    if (Count > UINT32_MAX) {
-      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                           "Invalid number of teams %zu due to large loop "
-                           "trip count\n",
-                           Count);
-    }
-    GRPCounts[I] = (uint32_t)Count;
-  }
-  KEnv.AllowCooperative = false;
-  GroupCounts.groupCountX = GRPCounts[0];
-  GroupCounts.groupCountY = GRPCounts[1];
-  GroupCounts.groupCountZ = GRPCounts[2];
-  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
-
-  return Plugin::success();
-}
-
 Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                                  int32_t ThreadLimit, uint32_t *GroupSizes,
                                  L0LaunchEnvTy &KEnv) const {
@@ -454,7 +215,6 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
 
     decideKernelGroupArguments(Device, (uint32_t)NumTeams,
                                (uint32_t)ThreadLimit, GroupSizes, KEnv);
-    KEnv.AllowCooperative = false;
   }
 
   return Plugin::success();
@@ -492,13 +252,8 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Kernel depends on %zu data copying events.\n", NumWaitEvents);
-  if (KEnv.AllowCooperative)
-    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                      zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
-                      WaitEvents);
-  else
-    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                      &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
+  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                    &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
   KEnv.KernelPR.Mtx.unlock();
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
@@ -537,12 +292,8 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
        "Using regular command list for kernel submission.\n");
 
   ze_event_handle_t Event = nullptr;
-  if (KEnv.AllowCooperative)
-    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchCooperativeKernel, CmdList,
-                      zeKernel, &KEnv.GroupCounts, Event, 0, nullptr);
-  else
-    CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
-                      &KEnv.GroupCounts, Event, 0, nullptr);
+  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                    &KEnv.GroupCounts, Event, 0, nullptr);
   KEnv.KernelPR.Mtx.unlock();
   CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
   CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),

>From 32b9b59d76c2ba22489d1aed2e5786cf5c7afb6c Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 23:38:27 +0100
Subject: [PATCH 69/70] Use correct thread information

---
 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index f0d32b78b4732..bbf101efdf4c7 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -374,8 +374,8 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   auto zeKernel = getZeKernel();
   auto DeviceId = l0Device.getDeviceId();
   int32_t NumArgs = KernelArgs.NumArgs;
-  int32_t NumTeams = KernelArgs.NumTeams[0];
-  int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+  int32_t NumTeams = NumBlocks[0];
+  int32_t ThreadLimit = NumThreads[0];
   if (NumTeams < 0)
     NumTeams = 0;
   if (ThreadLimit < 0)

>From b44f992caf54c2eca98999dc65289bab58a3bfaf Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Mon, 3 Nov 2025 23:50:53 +0100
Subject: [PATCH 70/70] format

---
 .../plugins-nextgen/level_zero/src/L0Memory.cpp    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
index 5ad371a92db53..65637fe29af4b 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -330,17 +330,19 @@ size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
   return Deallocated;
 }
 
-void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t ReqSize,
-                                            size_t AllocSize, int32_t Kind, bool InPool,
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base,
+                                            size_t ReqSize, size_t AllocSize,
+                                            int32_t Kind, bool InPool,
                                             bool ImplicitArg) {
-  const auto Inserted =
-      Map.emplace(Ptr, MemAllocInfoTy{Base, ReqSize, AllocSize, Kind, InPool, ImplicitArg});
+  const auto Inserted = Map.emplace(
+      Ptr, MemAllocInfoTy{Base, ReqSize, AllocSize, Kind, InPool, ImplicitArg});
   // Check if we keep valid disjoint memory ranges.
   [[maybe_unused]] bool Valid = Inserted.second;
   if (Valid) {
     if (Inserted.first != Map.begin()) {
       const auto I = std::prev(Inserted.first, 1);
-      Valid = Valid && (uintptr_t)I->first + I->second.ReqSize <= (uintptr_t)Ptr;
+      Valid =
+          Valid && (uintptr_t)I->first + I->second.ReqSize <= (uintptr_t)Ptr;
     }
     if (Valid) {
       const auto I = std::next(Inserted.first, 1);
@@ -668,7 +670,7 @@ Expected<void *> MemAllocatorTy::allocFromL0(size_t Size, size_t Align,
   return Mem;
 }
 
-Error MemAllocatorTy::deallocFromL0 (void *Ptr) {
+Error MemAllocatorTy::deallocFromL0(void *Ptr) {
   CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Ptr);
   DP("Freed device pointer " DPxMOD "\n", DPxPTR(Ptr));
   return Plugin::success();



More information about the llvm-commits mailing list