[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)

Alex Duran via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 18 10:02:17 PDT 2025


https://github.com/adurang updated https://github.com/llvm/llvm-project/pull/158900

>From 0c427647d9ce0de9506992dfb16074178bebcc19 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 11:46:48 +0200
Subject: [PATCH 01/13] [OFFLOAD] Add plugin with support for Intel Level Zero

---
 offload/CMakeLists.txt                        |   17 +-
 .../Modules/LibomptargetGetDependencies.cmake |   21 +
 offload/include/OpenMP/InteropAPI.h           |    7 +-
 offload/include/PerThreadTable.h              |  155 ++-
 .../plugins-nextgen/common/include/DLWrap.h   |   16 +
 .../plugins-nextgen/level_zero/CMakeLists.txt |   69 ++
 .../level_zero/include/AsyncQueue.h           |   50 +
 .../level_zero/include/L0Context.h            |  138 +++
 .../level_zero/include/L0Defs.h               |   73 ++
 .../level_zero/include/L0Device.h             |  680 +++++++++++
 .../level_zero/include/L0Interop.h            |   25 +
 .../level_zero/include/L0Kernel.h             |  154 +++
 .../level_zero/include/L0Memory.h             |  574 +++++++++
 .../level_zero/include/L0Options.h            |  189 +++
 .../level_zero/include/L0Plugin.h             |  136 +++
 .../level_zero/include/L0Program.h            |  135 +++
 .../level_zero/include/L0Trace.h              |  193 +++
 .../plugins-nextgen/level_zero/include/TLS.h  |   86 ++
 .../level_zero/src/L0Context.cpp              |   41 +
 .../level_zero/src/L0Device.cpp               | 1065 +++++++++++++++++
 .../level_zero/src/L0DynWrapper.cpp           |  134 +++
 .../level_zero/src/L0Kernel.cpp               |  649 ++++++++++
 .../level_zero/src/L0Memory.cpp               |  637 ++++++++++
 .../level_zero/src/L0Options.cpp              |  371 ++++++
 .../level_zero/src/L0Plugin.cpp               |  285 +++++
 .../level_zero/src/L0Program.cpp              |  625 ++++++++++
 .../level_zero/src/OmpWrapper.cpp             |   71 ++
 27 files changed, 6586 insertions(+), 10 deletions(-)
 create mode 100644 offload/plugins-nextgen/level_zero/CMakeLists.txt
 create mode 100644 offload/plugins-nextgen/level_zero/include/AsyncQueue.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Context.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Defs.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Device.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Interop.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Kernel.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Memory.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Options.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Plugin.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Program.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/L0Trace.h
 create mode 100644 offload/plugins-nextgen/level_zero/include/TLS.h
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Context.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Device.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Memory.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Options.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/L0Program.cpp
 create mode 100644 offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index b277380783500..8a704ab05eb53 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
   message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
 endif()
 
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero)
 set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
-    "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+    "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".")
 
 if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
   set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
@@ -176,6 +176,19 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
   endif()
 endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
+        CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
+  if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+    message(STATUS "Not building Level Zero plugin: it is only supported on "
+           "Linux/Windows x86_64 hosts")
+    list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+  endif()
+endif()
+if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
+		NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+    message(STATUS "Not building Level Zero plugin: dependencies not found")
+    list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+endif()
 message(STATUS "Building the offload library with support for "
                "the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
 
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 2a8bdebf2c1dd..0af0ae1ecdbec 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -89,4 +89,25 @@ if(LIBOMPTARGET_AMDGPU_ARCH)
   endif()
 endif()
 
+################################################################################
+# Looking for Level0
+################################################################################
+message(STATUS "Looking for Level0 includes.")
+find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
+
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
+	set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
+  message(STATUS "Could NOT find Level Zero. Missing includes.")
+else()
+  message(STATUS "Level Zero include DIR: ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}")
+  set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
+  message(STATUS "Looking for Level Zero library.")
+  find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
+  if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES)
+    message(STATUS "Could NOT find Level Zero. Missing library.")
+  else()
+	  message(STATUS "Level Zero library: ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES}")
+  endif()
+endif()
+
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h
index 53ac4be2e2e98..2553bfa930784 100644
--- a/offload/include/OpenMP/InteropAPI.h
+++ b/offload/include/OpenMP/InteropAPI.h
@@ -160,17 +160,12 @@ struct InteropTableEntry {
     Interops.push_back(obj);
   }
 
-  template <class ClearFuncTy> void clear(ClearFuncTy f) {
-    for (auto &Obj : Interops) {
-      f(Obj);
-    }
-  }
-
   /// vector interface
   int size() const { return Interops.size(); }
   iterator begin() { return Interops.begin(); }
   iterator end() { return Interops.end(); }
   iterator erase(iterator it) { return Interops.erase(it); }
+  void clear() { Interops.clear(); }
 };
 
 struct InteropTblTy
diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index 45b196171b4c8..0241370953c67 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -16,6 +16,60 @@
 #include <list>
 #include <memory>
 #include <mutex>
+#include <type_traits>
+
+template <typename ObjectType> struct PerThread {
+  struct PerThreadData {
+    std::unique_ptr<ObjectType> ThEntry;
+  };
+
+  std::mutex Mtx;
+  std::list<std::shared_ptr<PerThreadData>> ThreadDataList;
+
+  // define default constructors, disable copy and move constructors
+  PerThread() = default;
+  PerThread(const PerThread &) = delete;
+  PerThread(PerThread &&) = delete;
+  PerThread &operator=(const PerThread &) = delete;
+  PerThread &operator=(PerThread &&) = delete;
+  ~PerThread() {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    ThreadDataList.clear();
+  }
+
+private:
+  PerThreadData &getThreadData() {
+    static thread_local std::shared_ptr<PerThreadData> ThData = nullptr;
+    if (!ThData) {
+      ThData = std::make_shared<PerThreadData>();
+      std::lock_guard<std::mutex> Lock(Mtx);
+      ThreadDataList.push_back(ThData);
+    }
+    return *ThData;
+  }
+
+protected:
+  ObjectType &getThreadEntry() {
+    auto &ThData = getThreadData();
+    if (ThData.ThEntry)
+      return *ThData.ThEntry;
+    ThData.ThEntry = std::make_unique<ObjectType>();
+    return *ThData.ThEntry;
+  }
+
+public:
+  ObjectType &get() { return getThreadEntry(); }
+
+  template <class F> void clear(F f) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    for (auto ThData : ThreadDataList) {
+      if (!ThData->ThEntry)
+        continue;
+      f(*ThData->ThEntry);
+    }
+    ThreadDataList.clear();
+  }
+};
 
 // Using an STL container (such as std::vector) indexed by thread ID has
 // too many race conditions issues so we store each thread entry into a
@@ -23,10 +77,32 @@
 // T is the container type used to store the objects, e.g., std::vector,
 // std::set, etc. by each thread. O is the type of the stored objects e.g.,
 // omp_interop_val_t *, ...
-
 template <typename ContainerType, typename ObjectType> struct PerThreadTable {
   using iterator = typename ContainerType::iterator;
 
+  template <typename, typename = std::void_t<>>
+  struct has_iterator : std::false_type {};
+  template <typename T>
+  struct has_iterator<T, std::void_t<typename T::iterator>> : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_clear : std::false_type {};
+  template <typename T>
+  struct has_clear<T, std::void_t<decltype(std::declval<T>().clear())>>
+      : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_clearAll : std::false_type {};
+  template <typename T>
+  struct has_clearAll<T, std::void_t<decltype(std::declval<T>().clearAll(1))>>
+      : std::true_type {};
+
+  template <typename, typename = std::void_t<>>
+  struct is_associative : std::false_type {};
+  template <typename T>
+  struct is_associative<T, std::void_t<typename T::mapped_type>>
+      : std::true_type {};
+
   struct PerThreadData {
     size_t NElements = 0;
     std::unique_ptr<ContainerType> ThEntry;
@@ -71,6 +147,11 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
     return ThData.NElements;
   }
 
+  void setNElements(size_t Size) {
+    auto &NElements = getThreadNElements();
+    NElements = Size;
+  }
+
 public:
   void add(ObjectType obj) {
     auto &Entry = getThreadEntry();
@@ -104,11 +185,81 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
     for (auto ThData : ThreadDataList) {
       if (!ThData->ThEntry || ThData->NElements == 0)
         continue;
-      ThData->ThEntry->clear(f);
+      if constexpr (has_clearAll<ContainerType>::value) {
+        ThData->ThEntry->clearAll(f);
+      } else if constexpr (has_iterator<ContainerType>::value &&
+                           has_clear<ContainerType>::value) {
+        for (auto &Obj : *ThData->ThEntry) {
+          if constexpr (is_associative<ContainerType>::value) {
+            f(Obj.second);
+          } else {
+            f(Obj);
+          }
+        }
+        ThData->ThEntry->clear();
+      } else {
+        static_assert(sizeof(ContainerType) == 0, "Container type not supported");
+      }
       ThData->NElements = 0;
     }
     ThreadDataList.clear();
   }
 };
 
+template <typename T, typename = std::void_t<>> struct ContainerValueType {
+  using type = typename T::value_type;
+};
+template <typename T>
+struct ContainerValueType<T, std::void_t<typename T::mapped_type>> {
+  using type = typename T::mapped_type;
+};
+
+template <typename ContainerType, size_t reserveSize = 0>
+struct PerThreadContainer
+    : public PerThreadTable<ContainerType,
+                            typename ContainerValueType<ContainerType>::type> {
+
+  // helpers
+  template <typename T, typename = std::void_t<>> struct indexType {
+    using type = typename T::size_type;
+  };
+  template <typename T> struct indexType<T, std::void_t<typename T::key_type>> {
+    using type = typename T::key_type;
+  };
+  template <typename T, typename = std::void_t<>>
+  struct has_resize : std::false_type {};
+  template <typename T>
+  struct has_resize<T, std::void_t<decltype(std::declval<T>().resize(1))>>
+      : std::true_type {};
+
+  template <typename T, typename = std::void_t<>>
+  struct has_reserve : std::false_type {};
+  template <typename T>
+  struct has_reserve<T, std::void_t<decltype(std::declval<T>().reserve(1))>>
+      : std::true_type {};
+
+  using IndexType = typename indexType<ContainerType>::type;
+  using ObjectType = typename ContainerValueType<ContainerType>::type;
+
+  // Get the object for the given index in the current thread
+  ObjectType &get(IndexType Index) {
+    auto &Entry = this->getThreadEntry();
+
+    // specialized code for vector-like containers
+    if constexpr (has_resize<ContainerType>::value) {
+      if (Index >= Entry.size()) {
+        if constexpr (has_reserve<ContainerType>::value && reserveSize > 0) {
+          if (Entry.capacity() < reserveSize)
+            Entry.reserve(reserveSize);
+        }
+        // If the index is out of bounds, try resize the container
+        Entry.resize(Index + 1);
+      }
+    }
+    ObjectType &Ret = Entry[Index];
+    this->setNElements(Entry.size());
+    return Ret;
+  }
+};
+
 #endif
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h
index 8934e7e701021..95ce86e123cd3 100644
--- a/offload/plugins-nextgen/common/include/DLWrap.h
+++ b/offload/plugins-nextgen/common/include/DLWrap.h
@@ -282,5 +282,21 @@ template <size_t Requested, size_t Required> constexpr void verboseAssert() {
     return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
                                           x9, x10);                            \
   }
+#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T)                             \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7,                  \
+                        typename T::template arg<8>::type x8,                  \
+                        typename T::template arg<9>::type x9,                  \
+                        typename T::template arg<10>::type x10,                \
+                        typename T::template arg<11>::type x11) {              \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
+                                          x9, x10, x11);                       \
+  }
 
 #endif // OMPTARGET_SHARED_DLWRAP_H
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
new file mode 100644
index 0000000000000..b9c8dd423c3ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -0,0 +1,69 @@
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+return()
+endif()
+
+# Create the library and add the default arguments.
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
+
+set(LEVEL_ZERO_SRC_FILES
+        src/L0Context.cpp
+        src/L0Device.cpp
+        src/L0Kernel.cpp
+        src/L0Memory.cpp
+        src/L0Program.cpp
+        src/L0Plugin.cpp
+
+        src/L0Options.cpp
+)
+list(APPEND LEVEL_ZERO_SRC_FILES
+        src/OmpWrapper.cpp
+)
+
+target_sources(omptarget.rtl.level_zero PRIVATE
+   ${LEVEL_ZERO_SRC_FILES}
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+      ${CMAKE_CURRENT_SOURCE_DIR}/include
+      ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+      ${LIBOMPTARGET_INCLUDE_DIR}
+      ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+      ${LIBOMPTARGET_OMP_HEADER_DIR}
+)
+
+if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
+message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  target_link_libraries(omptarget.rtl.level_zero PRIVATE
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+  # Full path to the L0 library is recognized as a linker option, so we
+  # separate directory and file name
+  get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+  get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+  target_link_libraries(omptarget.rtl.level_zero PRIVATE
+          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+  target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
+  target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+  libomptarget_add_resource_file(omptarget.rtl.level_zero)
+else()
+   message(FATAL_ERROR "Missing platform support")
+endif()
+
+else()
+message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+   # Windows uses dll instead of lib files at runtime
+   string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
+endif()
+target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+endif()
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
new file mode 100644
index 0000000000000..105f68205e402
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -0,0 +1,50 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Async Queue wrapper for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <vector>
+
+#include "L0Memory.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Abstract queue that supports asynchronous command submission
+struct AsyncQueueTy {
+  /// List of events attached to submitted commands
+  std::vector<ze_event_handle_t> WaitEvents;
+  /// Pending staging buffer to host copies
+  std::list<std::tuple<void *, void *, size_t>> H2MList;
+  /// Pending USM memory copy commands that must wait for kernel completion
+  std::list<std::tuple<const void *, void *, size_t>> USM2MList;
+  /// Kernel event not signaled
+  ze_event_handle_t KernelEvent = nullptr;
+  /// Is this queue being used currently
+  bool InUse = false;
+  /// Clear data
+  void reset() {
+    WaitEvents.clear();
+    H2MList.clear();
+    USM2MList.clear();
+    KernelEvent = nullptr;
+  }
+};
+
+typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
new file mode 100644
index 0000000000000..b2b6def8101ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Memory.h"
+#include "PerThreadTable.h"
+
+namespace llvm::omp::target::plugin {
+
+class LevelZeroPluginTy;
+
+class L0ContextTLSTy {
+  StagingBufferTy StagingBuffer;
+
+public:
+  auto &getStagingBuffer() { return StagingBuffer; }
+  const auto &getStagingBuffer() const { return StagingBuffer; }
+
+  void clear() { StagingBuffer.clear(); }
+};
+
+struct L0ContextTLSTableTy
+    : public PerThreadContainer<
+          std::unordered_map<ze_context_handle_t, L0ContextTLSTy>> {
+  void clear() {
+    PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); });
+  }
+};
+
+/// Driver and context-specific resources. We assume a single context per
+/// driver.
+class L0ContextTy {
+  /// The plugin that created this context
+  LevelZeroPluginTy &Plugin;
+
+  /// Level Zero Driver handle
+  ze_driver_handle_t zeDriver = nullptr;
+
+  /// Common Level Zero context
+  ze_context_handle_t zeContext = nullptr;
+
+  /// API version supported by the Level Zero driver
+  ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT;
+
+  /// Imported external pointers. Track this only for user-directed
+  /// imports/releases.
+  std::unordered_map<uintptr_t, size_t> ImportedPtrs;
+
+  /// Common event pool
+  EventPoolTy EventPool;
+
+  /// Host Memory allocator for this driver
+  MemAllocatorTy HostMemAllocator;
+
+public:
+  /// Named constants for checking the imported external pointer regions.
+  static constexpr int32_t ImportNotExist = -1;
+  static constexpr int32_t ImportUnknown = 0;
+  static constexpr int32_t ImportExist = 1;
+
+  /// Create context, initialize event pool and extension functions
+  L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+              int32_t DriverId);
+
+  L0ContextTy(const L0ContextTy &) = delete;
+  L0ContextTy(L0ContextTy &&) = delete;
+  L0ContextTy &operator=(const L0ContextTy &) = delete;
+  L0ContextTy &operator=(const L0ContextTy &&) = delete;
+
+  /// Release resources
+  ~L0ContextTy() {
+    EventPool.deinit();
+    HostMemAllocator.deinit();
+    if (zeContext)
+      CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
+  }
+
+  auto &getPlugin() const { return Plugin; }
+
+  StagingBufferTy &getStagingBuffer();
+
+  /// Add imported external pointer region.
+  void addImported(void *Ptr, size_t Size) {
+    (void)ImportedPtrs.emplace((uintptr_t)Ptr, Size);
+  }
+
+  /// Remove imported external pointer region
+  void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); }
+
+  /// Check if imported regions contain the specified region.
+  int32_t checkImported(void *Ptr, size_t Size) const {
+    uintptr_t LB = (uintptr_t)Ptr;
+    uintptr_t UB = LB + Size;
+    // We do not expect a large number of user-directed imports, so use simple
+    // logic.
+    for (auto &I : ImportedPtrs) {
+      uintptr_t ILB = I.first;
+      uintptr_t IUB = ILB + I.second;
+      if (LB >= ILB && UB <= IUB)
+        return ImportExist;
+      if (LB < IUB && UB > ILB)
+        return ImportUnknown;
+    }
+    return ImportNotExist;
+  }
+
+  ze_driver_handle_t getZeDriver() const { return zeDriver; }
+
+  /// Return context associated with the driver
+  ze_context_handle_t getZeContext() const { return zeContext; }
+
+  /// Return driver API version
+  ze_api_version_t getDriverAPIVersion() const { return APIVersion; }
+
+  /// Return the event pool of this driver
+  auto &getEventPool() { return EventPool; }
+  const auto &getEventPool() const { return EventPool; }
+
+  bool supportsLargeMem() const {
+    // Large memory support is available since API version 1.1
+    return getDriverAPIVersion() >= ZE_API_VERSION_1_1;
+  }
+
+  const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
+  MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
new file mode 100644
index 0000000000000..81566f52a2aea
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -0,0 +1,73 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// External and other auxiliary definitions
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "PluginInterface.h"
+#include "Shared/Requirements.h"
+#include "omptarget.h"
+
+#define LIBOMP_DECL(RetType, FnDecl) RetType __cdecl FnDecl
+
+enum class AllocOptionTy : int32_t {
+  ALLOC_OPT_NONE = 0,
+  ALLOC_OPT_REDUCTION_SCRATCH = 1,
+  ALLOC_OPT_REDUCTION_COUNTER = 2,
+  ALLOC_OPT_HOST_MEM = 3,
+  ALLOC_OPT_SLM = 4,
+};
+
+/// Host runtime routines being used
+extern "C" {
+LIBOMP_DECL(int, omp_get_max_teams(void));
+LIBOMP_DECL(int, omp_get_thread_limit(void));
+LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
+LIBOMP_DECL(double, omp_get_wtime(void));
+} // extern "C"
+
+#ifndef EXTRACT_BITS
+// MSB=63, LSB=0
+#define EXTRACT_BITS(I64, HIGH, LOW)                                           \
+  (((uint64_t)I64) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1)
+#endif
+
+namespace llvm::omp::target::plugin {
+
+/// Default alignment for allocation
+constexpr size_t L0Alignment = 0;
+/// Default staging buffer size for host to device copy (16KB)
+constexpr size_t L0StagingBufferSize = (1 << 14);
+/// Default staging buffer count
+constexpr size_t L0StagingBufferCount = 64;
+/// USM allocation threshold where preallocation does not pay off (128MB)
+constexpr size_t L0UsmPreAllocThreshold = (128 << 20);
+/// Host USM allocation threshold where preallocation does not pay off (8MB)
+constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20);
+
+using namespace error;
+/// Generic L0 handle type
+using ZeHandleTy = void *;
+
+template <typename... ArgsTy>
+Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+
+  if (Code == OFFLOAD_SUCCESS)
+    return Plugin::success();
+  const char *Desc = "Unknown error";
+  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
+                                                    ErrFmt, Args..., Desc);
+}
+
+#define L0_UNIMPLEMENTED_ERR                                                   \
+  return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet\n",   \
+                       __func__);
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
new file mode 100644
index 0000000000000..6acfa7e0ee67d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -0,0 +1,680 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/SmallVector.h"
+
+#include "PerThreadTable.h"
+
+#include "AsyncQueue.h"
+#include "L0Context.h"
+#include "L0Program.h"
+#include "PluginInterface.h"
+#include "TLS.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+using OmpInteropTy = omp_interop_val_t *;
+class LevelZeroPluginTy;
+
+// clang-format off
+/// Known Intel GPU PCI device-id families. Values are the high byte of the
+/// PCI device id; compare against `deviceId & 0xFF00` (see
+/// L0DeviceTy::isDiscrete below).
+enum class PCIIdTy : int32_t {
+  None            = 0x0000,
+  SKL             = 0x1900,
+  KBL             = 0x5900,
+  CFL             = 0x3E00,
+  CFL_2           = 0x9B00,
+  ICX             = 0x8A00,
+  TGL             = 0xFF20,
+  TGL_2           = 0x9A00,
+  DG1             = 0x4900,
+  RKL             = 0x4C00,
+  ADLS            = 0x4600,
+  RTL             = 0xA700,
+  MTL             = 0x7D00,
+  PVC             = 0x0B00,
+  DG2_ATS_M       = 0x4F00,
+  DG2_ATS_M_2     = 0x5600,
+  LNL             = 0x6400,
+  BMG             = 0xE200,
+};
+
+/// Device type enumeration common to compiler and runtime.
+/// Values are distinct bits so they can be combined as a mask.
+enum class DeviceArchTy : uint64_t {
+  DeviceArch_None   = 0,
+  DeviceArch_Gen    = 0x0001, // Gen 9, Gen 11 or Xe
+  DeviceArch_XeLPG  = 0x0002,
+  DeviceArch_XeHPC  = 0x0004,
+  DeviceArch_XeHPG  = 0x0008,
+  DeviceArch_Xe2LP  = 0x0010,
+  DeviceArch_Xe2HP  = 0x0020,
+  DeviceArch_x86_64 = 0x0100
+};
+// clang-format on
+
+/// Identifies a Level Zero device: the driver handle plus the plugin's
+/// root-device, sub-device and CCS indices (-1 when not applicable).
+struct L0DeviceIdTy {
+  ze_device_handle_t zeId;
+  int32_t RootId;
+  int32_t SubId;
+  int32_t CCSId;
+
+  L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1,
+               int32_t CCSId = -1)
+      : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {}
+};
+
+/// Per-thread Level Zero command lists and queues for one device.
+/// Handles are created lazily elsewhere and must be released with clear()
+/// before destruction -- the destructor only asserts that this happened.
+class L0DeviceTLSTy {
+  /// Command list for each device
+  ze_command_list_handle_t CmdList = nullptr;
+
+  /// Main copy command list for each device
+  ze_command_list_handle_t CopyCmdList = nullptr;
+
+  /// Link copy command list for each device
+  ze_command_list_handle_t LinkCopyCmdList = nullptr;
+
+  /// Command queue for each device
+  ze_command_queue_handle_t CmdQueue = nullptr;
+
+  /// Main copy command queue for each device
+  ze_command_queue_handle_t CopyCmdQueue = nullptr;
+
+  /// Link copy command queues for each device
+  ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
+
+  /// Immediate command list for each device
+  ze_command_list_handle_t ImmCmdList = nullptr;
+
+  /// Immediate copy command list for each device
+  ze_command_list_handle_t ImmCopyCmdList = nullptr;
+
+public:
+  L0DeviceTLSTy() = default;
+  ~L0DeviceTLSTy() {
+    // assert all fields are nullptr on destruction
+    assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
+    assert(CopyCmdList == nullptr &&
+           "CopyCmdList is not nullptr on destruction");
+    assert(LinkCopyCmdList == nullptr &&
+           "LinkCopyCmdList is not nullptr on destruction");
+    assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
+    assert(CopyCmdQueue == nullptr &&
+           "CopyCmdQueue is not nullptr on destruction");
+    assert(LinkCopyCmdQueue == nullptr &&
+           "LinkCopyCmdQueue is not nullptr on destruction");
+    assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
+    assert(ImmCopyCmdList == nullptr &&
+           "ImmCopyCmdList is not nullptr on destruction");
+  }
+
+  L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
+  // noexcept: only exchanges raw handles, and containers (e.g. std::vector in
+  // L0DeviceTLSTableTy) move instead of copy on reallocation only when the
+  // move constructor cannot throw.
+  L0DeviceTLSTy(L0DeviceTLSTy &&Other) noexcept {
+    CmdList = std::exchange(Other.CmdList, nullptr);
+    CopyCmdList = std::exchange(Other.CopyCmdList, nullptr);
+    LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr);
+    CmdQueue = std::exchange(Other.CmdQueue, nullptr);
+    CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr);
+    LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr);
+    ImmCmdList = std::exchange(Other.ImmCmdList, nullptr);
+    ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr);
+  }
+
+  /// Destroy all lists and queues and reset every handle to nullptr.
+  void clear() {
+    // destroy all lists and queues
+    if (CmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
+    if (CopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
+    if (LinkCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
+    if (ImmCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
+    if (ImmCopyCmdList)
+      CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList);
+    if (CmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
+    if (CopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
+    if (LinkCopyCmdQueue)
+      CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
+
+    CmdList = nullptr;
+    CopyCmdList = nullptr;
+    LinkCopyCmdList = nullptr;
+    CmdQueue = nullptr;
+    CopyCmdQueue = nullptr;
+    LinkCopyCmdQueue = nullptr;
+    ImmCmdList = nullptr;
+    ImmCopyCmdList = nullptr;
+  }
+
+  L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
+  L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
+
+  auto getCmdList() const { return CmdList; }
+  void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; }
+
+  auto getCopyCmdList() const { return CopyCmdList; }
+  void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) {
+    CopyCmdList = _CopyCmdList;
+  }
+
+  auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+  void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) {
+    LinkCopyCmdList = _LinkCopyCmdList;
+  }
+
+  auto getImmCmdList() const { return ImmCmdList; }
+  void setImmCmdList(ze_command_list_handle_t _ImmCmdList) {
+    ImmCmdList = _ImmCmdList;
+  }
+
+  auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+  void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) {
+    ImmCopyCmdList = _ImmCopyCmdList;
+  }
+
+  auto getCmdQueue() const { return CmdQueue; }
+  void setCmdQueue(ze_command_queue_handle_t _CmdQueue) {
+    CmdQueue = _CmdQueue;
+  }
+
+  auto getCopyCmdQueue() const { return CopyCmdQueue; }
+  void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) {
+    CopyCmdQueue = _CopyCmdQueue;
+  }
+
+  auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+  void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) {
+    LinkCopyCmdQueue = _LinkCopyCmdQueue;
+  }
+};
+
+/// Per-thread table of L0DeviceTLSTy entries, one vector element per device.
+struct L0DeviceTLSTableTy
+    : public PerThreadContainer<std::vector<L0DeviceTLSTy>, 8> {
+  void clear() {
+    // NOTE(review): calls PerThreadTable::clear although the declared base is
+    // PerThreadContainer -- confirm PerThreadTable is an (indirect) base.
+    PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); });
+  }
+};
+
+/// Level Zero implementation of the plugin's generic device interface.
+class L0DeviceTy final : public GenericDeviceTy {
+  // Level Zero Context for this Device
+  L0ContextTy &l0Context;
+
+  // Level Zero handle for this Device
+  ze_device_handle_t zeDevice;
+  // Device Properties
+  ze_device_properties_t DeviceProperties{};
+  ze_device_compute_properties_t ComputeProperties{};
+  ze_device_memory_properties_t MemoryProperties{};
+  ze_device_cache_properties_t CacheProperties{};
+
+  /// Devices' default target allocation kind for internal allocation
+  int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+  DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None;
+
+  std::string DeviceName;
+
+  /// Common indirect access flags for this device
+  ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0;
+
+  /// Device UUID for toplevel devices only
+  std::string DeviceUuid;
+
+  /// L0 Device ID as string
+  std::string zeId;
+
+  /// Command queue group ordinals for each device
+  std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
+  /// Command queue group ordinals for copying
+  std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
+  /// Command queue group ordinals and number of queues for link copy engines
+  std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
+
+  /// Command queue index for each device
+  uint32_t ComputeIndex = 0;
+
+  bool IsAsyncEnabled = false;
+
+  // lock for this device
+  std::mutex Mutex;
+
+  /// Contains all modules (possibly from multiple device images) to handle
+  /// dynamic link across multiple images
+  llvm::SmallVector<ze_module_handle_t> GlobalModules;
+
+  /// L0 programs created for this device
+  std::list<L0ProgramTy> Programs;
+
+  /// MemAllocator for this device
+  MemAllocatorTy MemAllocator;
+
+  /// The current size of the global device memory pool (managed by us).
+  uint64_t HeapSize = 1L << 23L /* 8MB */;
+
+  int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
+  int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+                     __tgt_async_info *AsyncInfo);
+  int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+                       __tgt_async_info *AsyncInfo);
+
+  bool shouldSetupDeviceMemoryPool() const override { return false; }
+  DeviceArchTy computeArch() const;
+
+  /// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+  std::pair<uint32_t, uint32_t> findComputeOrdinal();
+
+  /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+  std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+
+  Error internalInit();
+
+public:
+  L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+             ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
+             const std::string &zeId, int32_t ComputeIndex)
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+        l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
+        ComputeIndex(ComputeIndex) {
+    DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    DeviceProperties.pNext = nullptr;
+    ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
+    ComputeProperties.pNext = nullptr;
+    MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+    MemoryProperties.pNext = nullptr;
+    CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
+    CacheProperties.pNext = nullptr;
+
+    auto Err = internalInit();
+    if (Err) {
+      FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
+                    toString(std::move(Err)).c_str());
+    }
+  }
+
+  static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
+    return static_cast<L0DeviceTy &>(Device);
+  }
+  static L0DeviceTy &makeL0Device(ompt_device_t *Device) {
+    return *static_cast<L0DeviceTy *>(Device);
+  }
+
+  auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+  L0DeviceTLSTy &getTLS();
+
+  Error setContext() override { return Plugin::success(); }
+  Error initImpl(GenericPluginTy &Plugin) override;
+  Error deinitImpl() override {
+    Programs.clear();
+    return Plugin::success();
+  }
+
+  auto getZeDevice() const { return zeDevice; }
+
+  const L0ContextTy &getL0Context() const { return l0Context; }
+  L0ContextTy &getL0Context() { return l0Context; }
+
+  const std::string &getName() const { return DeviceName; }
+  const char *getNameCStr() const { return DeviceName.c_str(); }
+
+  const std::string &getZeId() const { return zeId; }
+  const char *getZeIdCStr() const { return zeId.c_str(); }
+
+  std::mutex &getMutex() { return Mutex; }
+
+  auto getComputeIndex() const { return ComputeIndex; }
+  auto getIndirectFlags() const { return IndirectAccessFlags; }
+
+  auto getNumGlobalModules() const { return GlobalModules.size(); }
+  void addGlobalModule(ze_module_handle_t Module) {
+    GlobalModules.push_back(Module);
+  }
+  auto getGlobalModulesArray() { return GlobalModules.data(); }
+
+  /// Find the program created from the given image, or nullptr.
+  L0ProgramTy *getProgramFromImage(const __tgt_device_image *Image) {
+    for (auto &PGM : Programs)
+      if (PGM.getTgtImage() == Image)
+        return &PGM;
+    return nullptr;
+  }
+
+  /// Build the kernels of every program; stops at the first failure.
+  int32_t buildAllKernels() {
+    for (auto &PGM : Programs) {
+      int32_t RC = PGM.loadModuleKernels();
+      if (RC != OFFLOAD_SUCCESS)
+        return RC;
+    }
+    return OFFLOAD_SUCCESS;
+  }
+
+  // add a new program to the device. Return a reference to the new program
+  auto &addProgram(int32_t ImageId, const __tgt_device_image *Image) {
+    Programs.emplace_back(ImageId, *this, Image);
+    return Programs.back();
+  }
+
+  const auto &getLastProgram() const { return Programs.back(); }
+  auto &getLastProgram() { return Programs.back(); }
+  // Device properties getters
+  auto getVendorId() const { return DeviceProperties.vendorId; }
+  bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
+
+  auto getPCIId() const { return DeviceProperties.deviceId; }
+  auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
+  auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+  auto getNumEUsPerSubslice() const {
+    return DeviceProperties.numEUsPerSubslice;
+  }
+  auto getNumSubslicesPerSlice() const {
+    return DeviceProperties.numSubslicesPerSlice;
+  }
+  auto getNumSlices() const { return DeviceProperties.numSlices; }
+  auto getNumSubslices() const {
+    return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
+  }
+  uint32_t getNumEUs() const {
+    return DeviceProperties.numEUsPerSubslice * getNumSubslices();
+  }
+  auto getTotalThreads() const {
+    return DeviceProperties.numThreadsPerEU * getNumEUs();
+  }
+  auto getNumThreadsPerSubslice() const {
+    return getNumEUsPerSubslice() * getNumThreadsPerEU();
+  }
+  auto getClockRate() const { return DeviceProperties.coreClockRate; }
+
+  auto getMaxSharedLocalMemory() const {
+    return ComputeProperties.maxSharedLocalMemory;
+  }
+  auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+  auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+  auto getCacheSize() const { return CacheProperties.cacheSize; }
+
+  int32_t getAllocKind() const { return AllocKind; }
+  DeviceArchTy getDeviceArch() const { return DeviceArch; }
+  bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; }
+
+  /// True when the PCI id's family byte denotes a discrete GPU.
+  static bool isDiscrete(uint32_t PCIId) {
+    switch (static_cast<PCIIdTy>(PCIId & 0xFF00)) {
+    case PCIIdTy::BMG:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  static bool isDiscrete(ze_device_handle_t Device) {
+    ze_device_properties_t PR{};
+    PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    PR.pNext = nullptr;
+    CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR);
+    return isDiscrete(PR.deviceId);
+  }
+
+  bool isDiscreteDevice() { return isDiscrete(getPCIId()); }
+  bool isDeviceIPorNewer(uint32_t Version) const;
+
+  const std::string &getUuid() const { return DeviceUuid; }
+
+  uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
+  uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
+
+  bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
+  uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
+
+  uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
+  uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
+  bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
+
+  bool deviceRequiresImmCmdList() const {
+    return isDeviceIPorNewer(0x05004000);
+  }
+  bool asyncEnabled() const { return IsAsyncEnabled; }
+  bool useImmForCompute() const { return true; }
+  bool useImmForCopy() const { return true; }
+  bool useImmForInterop() const { return true; }
+  bool forceInorderInterop() const { return true; }
+
+  void reportDeviceInfo() const;
+
+  // Command queues related functions
+  /// Create a command list with given ordinal and flags
+  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+                                         ze_device_handle_t Device,
+                                         uint32_t Ordinal,
+                                         ze_command_list_flags_t Flags,
+                                         const std::string &DeviceIdStr);
+
+  /// Create a command list with default flags
+  ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+                                         ze_device_handle_t Device,
+                                         uint32_t Ordinal,
+                                         const std::string &DeviceIdStr);
+
+  ze_command_list_handle_t getCmdList();
+
+  /// Create a command queue with given ordinal and flags
+  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+                                           ze_device_handle_t Device,
+                                           uint32_t Ordinal, uint32_t Index,
+                                           ze_command_queue_flags_t Flags,
+                                           const std::string &DeviceIdStr);
+
+  /// Create a command queue with default flags
+  ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+                                           ze_device_handle_t Device,
+                                           uint32_t Ordinal, uint32_t Index,
+                                           const std::string &DeviceIdStr,
+                                           bool InOrder = false);
+
+  /// Create a new command queue for the given OpenMP device ID
+  ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+
+  /// Create an immediate command list
+  ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+                                            bool InOrder = false);
+
+  /// Create an immediate command list for computing
+  ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+    return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
+  }
+
+  /// Create an immediate command list for copying
+  ze_command_list_handle_t createImmCopyCmdList();
+  ze_command_queue_handle_t getCmdQueue();
+  ze_command_list_handle_t getCopyCmdList();
+  ze_command_queue_handle_t getCopyCmdQueue();
+  ze_command_list_handle_t getLinkCopyCmdList();
+  ze_command_queue_handle_t getLinkCopyCmdQueue();
+  ze_command_list_handle_t getImmCmdList();
+  ze_command_list_handle_t getImmCopyCmdList();
+
+  /// Enqueue copy command
+  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                         __tgt_async_info *AsyncInfo = nullptr,
+                         bool Locked = false, bool UseCopyEngine = true);
+
+  /// Enqueue asynchronous copy command
+  int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                              __tgt_async_info *AsyncInfo, bool CopyTo = true);
+
+  /// Enqueue fill command
+  int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
+                         size_t Size);
+
+  /// Driver related functions
+
+  /// Return the driver handle for this device
+  ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); }
+
+  /// Return context for this device
+  ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); }
+
+  /// Return driver API version for this device
+  ze_api_version_t getDriverAPIVersion() const {
+    return l0Context.getDriverAPIVersion();
+  }
+
+  /// Return an event from the driver associated to this device
+  ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); }
+
+  /// Release event to the pool associated to this device
+  void releaseEvent(ze_event_handle_t Event) {
+    l0Context.getEventPool().releaseEvent(Event, *this);
+  }
+
+  StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); }
+
+  bool supportsLargeMem() const { return l0Context.supportsLargeMem(); }
+
+  // Allocation related routines
+
+  /// Data alloc
+  void *dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+                  bool UserAlloc, bool DevMalloc = false,
+                  uint32_t MemAdvice = UINT32_MAX,
+                  AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
+
+  /// Data delete
+  int32_t dataDelete(void *Ptr);
+
+  /// Return the memory allocation type for the specified memory location.
+  uint32_t getMemAllocType(const void *Ptr) const;
+
+  const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; }
+  MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; }
+
+  /// Select allocator by allocation kind: host allocations come from the
+  /// context-wide host allocator, everything else from the device allocator.
+  MemAllocatorTy &getMemAllocator(int32_t Kind) {
+    if (Kind == TARGET_ALLOC_HOST)
+      return l0Context.getHostMemAllocator();
+    return getDeviceMemAllocator();
+  }
+
+  /// Select allocator by querying the allocation type of an existing pointer.
+  MemAllocatorTy &getMemAllocator(const void *Ptr) {
+    bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr));
+    if (IsHostMem)
+      return l0Context.getHostMemAllocator();
+    return getDeviceMemAllocator();
+  }
+
+  int32_t makeMemoryResident(void *Mem, size_t Size);
+
+  // Generic device interface implementation
+  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+                                           int32_t ImageId) override;
+  Error unloadBinaryImpl(DeviceImageTy *Image) override;
+  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+  int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
+
+  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+    return Plugin::error(error::ErrorCode::UNKNOWN,
+                         "dataLockImpl not supported");
+  }
+  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
+  Expected<bool> isPinnedPtrImpl(void *, void *&, void *&,
+                                 size_t &) const override {
+    // Don't need to do anything, this is handled by the driver.
+    return false;
+  }
+
+  Error dataFence(__tgt_async_info *Async) override;
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool ReleaseQueue) override;
+  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override;
+  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+                         void *DstPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  Error initDeviceInfoImpl(__tgt_device_info *Info) override;
+  Expected<bool>
+  hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
+  Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+                            AsyncInfoWrapperTy &AsyncInfo) override{
+      L0_UNIMPLEMENTED_ERR}
+
+  /* Event routines are used to ensure ordering between dataTransfers. Instead
+   * of adding extra events in the queues, we make sure they're ordered by
+   * using the events from the data submission APIs so we don't need to support
+   * these routines.
+   * They still need to report success to indicate the events are handled
+   * somewhere; waitEvent and syncEvent should remain unimplemented.
+   */
+  Expected<bool> isEventCompleteImpl(void *EventPtr,
+                                     AsyncInfoWrapperTy &) override {
+    return true;
+  }
+
+  Error createEventImpl(void **EventPtrStorage) override {
+    return Plugin::success();
+  }
+  Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
+  Error recordEventImpl(void *EventPtr,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::success();
+  }
+
+  Error waitEventImpl(void *EventPtr,
+                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+                         __func__);
+  }
+
+  Error syncEventImpl(void *EventPtr) override {
+    return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+                         __func__);
+  }
+
+  Expected<InfoTreeNode> obtainInfoImpl() override;
+
+  Error getDeviceStackSize(uint64_t &V) override {
+    V = 0;
+    return Plugin::success();
+  }
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override;
+
+  Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); }
+  Error getDeviceHeapSize(uint64_t &V) override {
+    V = HeapSize;
+    return Plugin::success();
+  }
+  Error setDeviceHeapSize(uint64_t V) override {
+    HeapSize = V;
+    return Plugin::success();
+  }
+
+  Expected<omp_interop_val_t *>
+  createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override;
+  Error releaseInterop(omp_interop_val_t *Interop) override;
+
+  interop_spec_t selectInteropPreference(int32_t InteropType,
+                                         int32_t NumPrefers,
+                                         interop_spec_t *Prefers) override;
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
new file mode 100644
index 0000000000000..4b8b417f9b339
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -0,0 +1,25 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interop support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+namespace llvm::omp::target::plugin::L0Interop {
+
+/// Level Zero interop property
+struct Property {
+  // Keep the command queue handle here: the interop's targetsync field may be
+  // rewritten when the preferred type is SYCL, so access the queue via this.
+  ze_command_queue_handle_t CommandQueue;
+  ze_command_list_handle_t ImmCmdList;
+};
+
+} // namespace llvm::omp::target::plugin::L0Interop
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
new file mode 100644
index 0000000000000..bc6fc54cdea08
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -0,0 +1,154 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+#include "PluginInterface.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+class L0ProgramTy;
+
+/// Loop descriptor
+struct TgtLoopDescTy {
+  int64_t Lb = 0;     // The lower bound of the i-th loop
+  int64_t Ub = 0;     // The upper bound of the i-th loop
+  int64_t Stride = 0; // The stride of the i-th loop
+};
+
+/// ND-range descriptor: up to three loop nests used to shape kernel groups.
+struct TgtNDRangeDescTy {
+  int32_t NumLoops = 0;      // Number of loops/dimensions
+  int32_t DistributeDim = 0; // Dimensions lower than this one
+                             // must end up in one WG
+  TgtLoopDescTy Levels[3];   // Up to 3 loops
+};
+
+/// Kernel properties.
+/// Also caches the group-shape parameters computed for the previous launch so
+/// a launch with identical inputs can skip recomputation.
+struct KernelPropertiesTy {
+  uint32_t Width = 0;
+  uint32_t SIMDWidth = 0;
+  uint32_t MaxThreadGroupSize = 0;
+
+  /// Cached input parameters used in the previous launch
+  TgtNDRangeDescTy LoopDesc;
+  int32_t NumTeams = -1;
+  int32_t ThreadLimit = -1;
+
+  /// Cached parameters used in the previous launch
+  ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX;
+  uint32_t GroupSizes[3] = {0, 0, 0};
+  ze_group_count_t GroupCounts{0, 0, 0};
+  bool AllowCooperative = false;
+
+  // NOTE(review): neither method below locks Mtx; callers are presumably
+  // expected to hold it while reading/updating the cache -- confirm at the
+  // call sites.
+  std::mutex Mtx;
+
+  /// All-zeros descriptor compared against when LoopDescPtr is null.
+  static constexpr TgtNDRangeDescTy LoopDescInit = {};
+
+  /// Check if we can reuse group parameters.
+  bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+                        const int32_t _NumTeams, const int32_t _ThreadLimit,
+                        uint32_t *_GroupSizes, ze_group_count_t &_GroupCounts,
+                        bool &_AllowCooperative) const {
+    if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
+      return false;
+    if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc)))
+      return false;
+    if (_NumTeams != NumTeams || _ThreadLimit != ThreadLimit)
+      return false;
+    // Found matching input parameters.
+    std::copy_n(GroupSizes, 3, _GroupSizes);
+    _GroupCounts = GroupCounts;
+    _AllowCooperative = AllowCooperative;
+    return true;
+  }
+
+  /// Update cached group parameters.
+  void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+                        const int32_t _NumTeams, const int32_t _ThreadLimit,
+                        const uint32_t *_GroupSizes,
+                        const ze_group_count_t &_GroupCounts,
+                        const bool &_AllowCooperative) {
+    LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
+    NumTeams = _NumTeams;
+    ThreadLimit = _ThreadLimit;
+    std::copy_n(_GroupSizes, 3, GroupSizes);
+    GroupCounts = _GroupCounts;
+    AllowCooperative = _AllowCooperative;
+  }
+};
+
+/// Level Zero implementation of the plugin's generic kernel interface.
+/// Owns the ze_kernel_handle_t and destroys it on destruction.
+class L0KernelTy : public GenericKernelTy {
+  // L0 Kernel Handle
+  ze_kernel_handle_t zeKernel;
+  // Kernel Properties
+  KernelPropertiesTy Properties;
+  auto &getProperties() { return Properties; }
+
+  int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
+                              KernelLaunchParamsTy LaunchParams,
+                              __tgt_async_info *AsyncInfo) const;
+
+  void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
+                                  uint32_t ThreadLimit,
+                                  TgtNDRangeDescTy *LoopLevels,
+                                  uint32_t *GroupSizes,
+                                  ze_group_count_t &GroupCounts,
+                                  bool HalfNumThreads,
+                                  bool IsTeamsNDRange) const;
+
+  int32_t decideLoopKernelGroupArguments(
+      L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+      uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+      bool &AllowCooperative) const;
+
+  Error buildKernel(L0ProgramTy &Program);
+
+public:
+  /// Create a L0 kernel with a name and an execution mode.
+  L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
+  ~L0KernelTy() {
+    if (zeKernel)
+      CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel);
+  }
+  // Non-copyable and non-movable: the handle has a single owner.
+  // (Fixed: the deleted move assignment previously took `const L0KernelTy &&`,
+  // which is not the move-assignment signature.)
+  L0KernelTy(const L0KernelTy &) = delete;
+  L0KernelTy(L0KernelTy &&) = delete;
+  L0KernelTy &operator=(const L0KernelTy &) = delete;
+  L0KernelTy &operator=(L0KernelTy &&) = delete;
+
+  const auto &getProperties() const { return Properties; }
+
+  /// Initialize the L0 kernel.
+  Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
+  /// Launch the L0 kernel function.
+  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
+                   uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
+                   KernelLaunchParamsTy LaunchParams,
+                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override{
+      L0_UNIMPLEMENTED_ERR}
+
+  ze_kernel_handle_t getZeKernel() const {
+    return zeKernel;
+  }
+
+  /// Compute group sizes/counts for a launch with the given team/thread
+  /// limits and optional loop descriptor.
+  int32_t getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+                         int32_t ThreadLimit, uint32_t *GroupSizes,
+                         ze_group_count_t &GroupCounts, void *LoopDesc,
+                         bool &AllowCooperative) const;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
new file mode 100644
index 0000000000000..50af80a19a93a
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -0,0 +1,574 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <level_zero/ze_api.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Map a TARGET_ALLOC kind to a human-readable string (used for logging and
+/// error messages only).
+#define ALLOC_KIND_TO_STR(Kind)                                                \
+  (Kind == TARGET_ALLOC_HOST                                                   \
+       ? "host memory"                                                         \
+       : (Kind == TARGET_ALLOC_SHARED                                          \
+              ? "shared memory"                                                \
+              : (Kind == TARGET_ALLOC_DEVICE ? "device memory"                 \
+                                             : "unknown memory")))
+
+// forward declarations
+struct L0OptionsTy;
+class L0DeviceTy;
+class L0ContextTy;
+
+/// Descriptor of one heap used for in-kernel dynamic memory allocation.
+/// The block descriptors/counters are bookkeeping arrays shared with the
+/// device side (see DynamicMemPoolTy) -- presumably consumed by the device
+/// RTL; confirm field layout against the device-side declaration.
+struct DynamicMemHeapTy {
+  /// Base address memory is allocated from
+  uintptr_t AllocBase = 0;
+  /// Minimal size served by the current heap
+  size_t BlockSize = 0;
+  /// Max size served by the current heap
+  size_t MaxSize = 0;
+  /// Available memory blocks
+  uint32_t NumBlocks = 0;
+  /// Number of block descriptors
+  uint32_t NumBlockDesc = 0;
+  /// Number of block counters
+  uint32_t NumBlockCounter = 0;
+  /// List of memory block descriptors
+  uint64_t *BlockDesc = nullptr;
+  /// List of memory block counters
+  uint32_t *BlockCounter = nullptr;
+};
+
+/// Top-level pool for in-kernel dynamic memory allocation; groups a fixed
+/// number of heaps that all share the same heap size.
+struct DynamicMemPoolTy {
+  /// Location of device memory blocks
+  void *PoolBase = nullptr;
+  /// Heap size common to all heaps
+  size_t HeapSize = 0;
+  /// Number of heaps available
+  uint32_t NumHeaps = 0;
+  /// Heap descriptors (using fixed-size array to simplify memory allocation)
+  DynamicMemHeapTy HeapDesc[8];
+};
+
+/// Memory allocation information used in memory allocation/deallocation.
+struct MemAllocInfoTy {
+  /// Base address allocated from compute runtime
+  void *Base = nullptr;
+  /// Allocation size known to users/libomptarget
+  size_t Size = 0;
+  /// TARGET_ALLOC kind
+  int32_t Kind = TARGET_ALLOC_DEFAULT;
+  /// Allocation from pool?
+  bool InPool = false;
+  /// Is implicit argument
+  bool ImplicitArg = false;
+
+  MemAllocInfoTy() = default;
+
+  /// Convenience constructor initializing all fields.
+  MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
+                 bool _ImplicitArg)
+      : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
+        ImplicitArg(_ImplicitArg) {}
+};
+
+/// Responsible for all activities involving memory allocation/deallocation.
+/// It contains memory pool management, memory allocation bookkeeping.
+class MemAllocatorTy {
+
+  /// Simple memory allocation statistics. Maintains numbers for pool allocation
+  /// and GPU RT allocation.
+  struct MemStatTy {
+    size_t Requested[2] = {0, 0}; // Requested bytes
+    size_t Allocated[2] = {0, 0}; // Allocated bytes
+    size_t Freed[2] = {0, 0};     // Freed bytes
+    size_t InUse[2] = {0, 0};     // Current memory in use
+    size_t PeakUse[2] = {0, 0};   // Peak bytes used
+    size_t NumAllocs[2] = {0, 0}; // Number of allocations
+    MemStatTy() = default;
+  };
+
+  /// Memory pool which enables reuse of already allocated blocks
+  /// -- Pool maintains a list of buckets each of which can allocate fixed-size
+  ///    memory.
+  /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
+  /// -- Each memory block can allocate multiple fixed-size memory requested by
+  ///    offload RT or user.
+  /// -- Memory allocation falls back to GPU RT allocation when the pool size
+  ///    (total memory used by pool) reaches a threshold.
+  class MemPoolTy {
+
+    /// Memory block maintained in each bucket
+    struct BlockTy {
+      /// Base address of this block
+      uintptr_t Base = 0;
+      /// Size of the block
+      size_t Size = 0;
+      /// Supported allocation size by this block
+      size_t ChunkSize = 0;
+      /// Total number of slots
+      uint32_t NumSlots = 0;
+      /// Number of slots in use
+      uint32_t NumUsedSlots = 0;
+      /// Cached available slot returned by the last dealloc() call
+      uint32_t FreeSlot = UINT32_MAX;
+      /// Marker for the currently used slots
+      std::vector<bool> UsedSlots;
+
+      BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
+        Base = reinterpret_cast<uintptr_t>(_Base);
+        Size = _Size;
+        ChunkSize = _ChunkSize;
+        NumSlots = Size / ChunkSize;
+        NumUsedSlots = 0;
+        UsedSlots.resize(NumSlots, false);
+      }
+
+      /// Check if the current block is fully used
+      bool isFull() const { return NumUsedSlots == NumSlots; }
+
+      /// Check if the given address belongs to the current block
+      bool contains(void *Mem) const {
+        auto M = reinterpret_cast<uintptr_t>(Mem);
+        return M >= Base && M < Base + Size;
+      }
+
+      /// Allocate a single chunk from the block
+      void *alloc();
+
+      /// Deallocate the given memory
+      void dealloc(void *Mem);
+    }; // BlockTy
+
+    /// Allocation kind for the current pool
+    int32_t AllocKind = TARGET_ALLOC_DEFAULT;
+    /// Access to the allocator
+    MemAllocatorTy *Allocator = nullptr;
+    /// Minimum supported memory allocation size from pool
+    size_t AllocMin = 1 << 6; // 64B
+    /// Maximum supported memory allocation size from pool
+    size_t AllocMax = 0;
+    /// Allocation size when the pool needs to allocate a block
+    size_t AllocUnit = 1 << 16; // 64KB
+    /// Capacity of each block in the buckets which decides number of
+    /// allocatable chunks from the block. Each block in the bucket can serve
+    /// at least BlockCapacity chunks.
+    /// If ChunkSize * BlockCapacity <= AllocUnit
+    ///   BlockSize = AllocUnit
+    /// Otherwise,
+    ///   BlockSize = ChunkSize * BlockCapacity
+    /// This simply means how much memory is over-allocated.
+    uint32_t BlockCapacity = 0;
+    /// Total memory allocated from GPU RT for this pool
+    size_t PoolSize = 0;
+    /// Maximum allowed pool size. Allocation falls back to GPU RT allocation
+    /// when PoolSize reaches PoolSizeMax.
+    size_t PoolSizeMax = 0;
+    /// Small allocation size allowed in the pool even if pool size is over the
+    /// pool size limit
+    size_t SmallAllocMax = 1024;
+    /// Small allocation pool size
+    size_t SmallPoolSize = 0;
+    /// Small allocation pool size max (4MB)
+    size_t SmallPoolSizeMax = (4 << 20);
+    /// List of buckets
+    std::vector<std::vector<BlockTy *>> Buckets;
+    /// List of bucket parameters
+    std::vector<std::pair<size_t, size_t>> BucketParams;
+    /// Map from allocated pointer to corresponding block.
+    std::unordered_map<void *, BlockTy *> PtrToBlock;
+    /// Simple stats counting miss/hit in each bucket.
+    std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
+    /// Need to zero-initialize after L0 allocation
+    bool ZeroInit = false;
+    /// Zero-initialized values to be copied to device
+    std::vector<char> ZeroInitValue;
+
+    /// Get bucket ID from the specified allocation size.
+    /// Bucket N serves chunk sizes of AllocMin * 2^N.
+    uint32_t getBucketId(size_t Size) {
+      uint32_t Count = 0;
+      for (size_t SZ = AllocMin; SZ < Size; Count++)
+        SZ <<= 1;
+      return Count;
+    }
+
+  public:
+    MemPoolTy() = default;
+
+    /// Construct pool with allocation kind, allocator, and user options.
+    MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+              const L0OptionsTy &Option);
+    // Used for reduction pool
+    MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+    // Used for small memory pool with fixed parameters
+    MemPoolTy(MemAllocatorTy *_Allocator);
+
+    MemPoolTy(const MemPoolTy &) = delete;
+    MemPoolTy(MemPoolTy &&) = delete;
+    MemPoolTy &operator=(const MemPoolTy &) = delete;
+    MemPoolTy &operator=(const MemPoolTy &&) = delete;
+
+    /// Print accumulated pool usage statistics (e.g. bucket hit/miss).
+    void printUsage();
+    /// Release resources used in the pool.
+    ~MemPoolTy();
+
+    /// Allocate the requested size of memory from this pool.
+    /// AllocSize is the chunk size internally used for the returned memory.
+    void *alloc(size_t Size, size_t &AllocSize);
+    /// Deallocate the specified memory and returns block size deallocated.
+    size_t dealloc(void *Ptr);
+  }; // MemPoolTy
+
+  /// Allocation information maintained in the plugin
+  class MemAllocInfoMapTy {
+    /// Map from allocated pointer to allocation information
+    std::map<void *, MemAllocInfoTy> Map;
+    /// Map from target alloc kind to number of implicit arguments
+    std::map<int32_t, uint32_t> NumImplicitArgs;
+
+  public:
+    /// Add allocation information to the map
+    void add(void *Ptr, void *Base, size_t Size, int32_t Kind,
+             bool InPool = false, bool ImplicitArg = false);
+
+    /// Remove allocation information for the given memory location
+    bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
+
+    /// Finds allocation information for the given memory location
+    const MemAllocInfoTy *find(void *Ptr) const {
+      auto AllocInfo = Map.find(Ptr);
+      if (AllocInfo == Map.end())
+        return nullptr;
+      else
+        return &AllocInfo->second;
+    }
+
+    /// Check if the map contains the given pointer and offset.
+    /// Returns true iff [Ptr, Ptr+Size) lies entirely within one recorded
+    /// allocation (looked up via the greatest base address <= Ptr).
+    bool contains(const void *Ptr, size_t Size) const {
+      if (Map.size() == 0)
+        return false;
+      auto I = Map.upper_bound(const_cast<void *>(Ptr));
+      if (I == Map.begin())
+        return false;
+      --I;
+      bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr &&
+                 (uintptr_t)Ptr + (uintptr_t)Size <=
+                     (uintptr_t)I->first + (uintptr_t)I->second.Size;
+      return Ret;
+    }
+
+    /// Returns the number of implicit arguments for the specified allocation
+    /// kind.
+    size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+  }; // MemAllocInfoMapTy
+
+  /// L0 context to use
+  const L0ContextTy *L0Context = nullptr;
+  /// L0 device to use
+  L0DeviceTy *Device = nullptr;
+  /// Whether the device supports large memory allocation
+  bool SupportsLargeMem = false;
+  /// Cached max alloc size supported by device
+  uint64_t MaxAllocSize = INT64_MAX;
+  /// Map from allocation kind to memory statistics
+  std::unordered_map<int32_t, MemStatTy> Stats;
+  /// Map from allocation kind to memory pool
+  std::unordered_map<int32_t, MemPoolTy> Pools;
+  /// Memory pool dedicated to reduction scratch space
+  std::unique_ptr<MemPoolTy> ReductionPool;
+  /// Memory pool dedicated to reduction counters
+  std::unique_ptr<MemPoolTy> CounterPool;
+  /// Allocation information map
+  MemAllocInfoMapTy AllocInfo;
+  /// RTL-owned memory that needs to be freed automatically
+  std::list<void *> MemOwned;
+  /// Lock protection
+  std::mutex Mtx;
+  /// Allocator only supports host memory
+  bool IsHostMem = false;
+  /// Internal deallocation helper; the caller must already hold Mtx.
+  int32_t dealloc_locked(void *Ptr);
+
+public:
+  MemAllocatorTy() = default;
+
+  MemAllocatorTy(const MemAllocatorTy &) = delete;
+  MemAllocatorTy(MemAllocatorTy &&) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+  /// Release resources and report statistics if requested
+  ~MemAllocatorTy() {
+    if (L0Context)
+      deinit(); // Release resources
+  }
+  /// Release all resources owned by the allocator (pools, owned memory).
+  void deinit();
+
+  /// Allocator only supports host memory
+  bool supportsHostMem() { return IsHostMem; }
+
+  void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+  void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+  void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+  /// Allocate memory from L0 GPU RT. We use over-allocation workaround
+  /// to support target pointer with offset, and positive "ActiveSize" is
+  /// specified in such cases for correct debug logging.
+  void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
+
+  /// Allocate memory with the specified information from a memory pool
+  void *alloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+              bool UserAlloc, bool DevMalloc, uint32_t MemAdvice,
+              AllocOptionTy AllocOpt);
+
+  /// Deallocate memory
+  int32_t dealloc(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return dealloc_locked(Ptr);
+  }
+
+  /// Check if the given memory location and offset belongs to any allocated
+  /// memory
+  bool contains(const void *Ptr, size_t Size) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return AllocInfo.contains(Ptr, Size);
+  }
+
+  /// Get allocation information for the specified memory location
+  const MemAllocInfoTy *getAllocInfo(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    return AllocInfo.find(Ptr);
+  }
+
+  /// Get kernel indirect access flags using implicit argument info
+  ze_kernel_indirect_access_flags_t getIndirectFlags() {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    ze_kernel_indirect_access_flags_t Ret = 0;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
+    if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
+      Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+    return Ret;
+  }
+
+  /// Log memory allocation/deallocation. ReqSize > 0 records an allocation,
+  /// ReqSize == 0 records a free of Size bytes. Index 0 tracks GPU RT
+  /// allocations, index 1 tracks pool allocations.
+  void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
+    if (Stats.count(Kind) == 0)
+      return; // Stat is disabled
+
+    auto &ST = Stats[Kind];
+    int32_t I = Pool ? 1 : 0;
+    if (ReqSize > 0) {
+      ST.Requested[I] += ReqSize;
+      ST.Allocated[I] += Size;
+      ST.InUse[I] += Size;
+      ST.NumAllocs[I]++;
+    } else {
+      ST.Freed[I] += Size;
+      ST.InUse[I] -= Size;
+    }
+    ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
+  }
+
+  /// Perform copy operation
+  int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+}; /// MemAllocatorTy
+
+// Simple generic wrapper to reuse objects.
+// Objects must have a zero-argument accessible constructor.
+template <class ObjTy> class ObjPool {
+  // Protects Objects against concurrent get()/release()
+  std::unique_ptr<std::mutex> Mtx;
+  // List of cached objects available for reuse
+  std::list<ObjTy *> Objects;
+
+public:
+  ObjPool() { Mtx.reset(new std::mutex); }
+
+  ObjPool(const ObjPool &) = delete;
+  ObjPool(ObjPool &&) = delete;
+  ObjPool &operator=(const ObjPool &) = delete;
+  ObjPool &operator=(const ObjPool &&) = delete;
+
+  /// Return a cached object if one is available, otherwise construct a new
+  /// one. The emptiness check must happen under the lock: release() can
+  /// mutate Objects concurrently, so an unsynchronized empty() probe is a
+  /// data race.
+  ObjTy *get() {
+    {
+      std::lock_guard<std::mutex> Lock(*Mtx);
+      if (!Objects.empty()) {
+        ObjTy *const Ret = Objects.back();
+        Objects.pop_back();
+        return Ret;
+      }
+    }
+    return new ObjTy();
+  }
+
+  /// Return an object to the pool for later reuse.
+  void release(ObjTy *obj) {
+    std::lock_guard<std::mutex> Lock(*Mtx);
+    Objects.push_back(obj);
+  }
+
+  /// Destroy all cached objects (no lock needed during destruction).
+  ~ObjPool() {
+    for (auto object : Objects)
+      delete object;
+  }
+};
+
+/// Common event pool used in the plugin. This event pool assumes all events
+/// from the pool are host-visible and use the same event pool flag.
+class EventPoolTy {
+  /// Size of L0 event pool created on demand
+  size_t PoolSize = 64;
+
+  /// Context of the events
+  ze_context_handle_t Context = nullptr;
+
+  /// Additional event pool flags common to this pool
+  uint32_t Flags = 0;
+
+  /// Protection
+  std::unique_ptr<std::mutex> Mtx;
+
+  /// List of created L0 event pools
+  std::list<ze_event_pool_handle_t> Pools;
+
+  /// List of free L0 events
+  std::list<ze_event_handle_t> Events;
+
+#ifdef OMPT_SUPPORT
+  /// Event to OMPT record map. The timestamp information is recorded to the
+  /// OMPT record before the event is recycled.
+  std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
+#endif // OMPT_SUPPORT
+
+public:
+  /// Initialize context, flags, and mutex
+  void init(ze_context_handle_t _Context, uint32_t _Flags) {
+    Context = _Context;
+    Flags = _Flags;
+    Mtx.reset(new std::mutex);
+  }
+
+  /// Destroys L0 resources (all free events, then their pools).
+  void deinit() {
+    for (auto E : Events)
+      CALL_ZE_RET_VOID(zeEventDestroy, E);
+    for (auto P : Pools)
+      CALL_ZE_RET_VOID(zeEventPoolDestroy, P);
+  }
+
+  /// Get a free event from the pool
+  ze_event_handle_t getEvent();
+
+  /// Return an event to the pool
+  void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
+};
+
+/// Staging buffer
+/// A single staging buffer is not enough when batching is enabled since there
+/// can be multiple pending copy operations.
+class StagingBufferTy {
+  /// Context for L0 calls
+  ze_context_handle_t Context = nullptr;
+  /// Max allowed size for staging buffer
+  size_t Size = L0StagingBufferSize;
+  /// Number of buffers allocated together
+  size_t Count = L0StagingBufferCount;
+  /// Buffers increasing by Count if a new buffer is required
+  std::list<void *> Buffers;
+  /// Next buffer location in the buffers
+  size_t Offset = 0;
+
+  /// Allocate one more chunk of Count buffers from L0 host memory and
+  /// append it to Buffers. Returns its base address (null on failure).
+  void *addBuffers() {
+    ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+                                       nullptr, 0};
+    void *Ret = nullptr;
+    size_t AllocSize = Size * Count;
+    CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
+                     L0Alignment, &Ret);
+    Buffers.push_back(Ret);
+    return Ret;
+  }
+
+public:
+  StagingBufferTy() = default;
+  StagingBufferTy(const StagingBufferTy &) = delete;
+  StagingBufferTy(StagingBufferTy &&) = delete;
+  StagingBufferTy &operator=(const StagingBufferTy &) = delete;
+  StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
+
+  ~StagingBufferTy() {
+    if (initialized())
+      clear();
+  }
+
+  /// Free all L0 allocations and return to the uninitialized state.
+  void clear() {
+    ze_result_t Rc;
+    (void)Rc; // GCC build compiler thinks Rc is unused for some reason.
+    for (auto Ptr : Buffers)
+      CALL_ZE(Rc, zeMemFree, Context, Ptr);
+    // Drop the now-dangling pointers and reset the cursor. Otherwise a
+    // subsequent init() + get()/getNext() would hand out freed memory.
+    Buffers.clear();
+    Offset = 0;
+    Context = nullptr;
+  }
+
+  bool initialized() const { return Context != nullptr; }
+
+  void init(ze_context_handle_t _Context, size_t _Size, size_t _Count) {
+    Context = _Context;
+    Size = _Size;
+    Count = _Count;
+  }
+
+  void reset() { Offset = 0; }
+
+  /// Always return the first buffer
+  void *get() {
+    if (Size == 0 || Count == 0)
+      return nullptr;
+    return Buffers.empty() ? addBuffers() : Buffers.front();
+  }
+
+  /// Return the next available buffer, growing the buffer list when the
+  /// running Offset passes the currently allocated capacity.
+  void *getNext() {
+    void *Ret = nullptr;
+    if (Size == 0 || Count == 0)
+      return Ret;
+
+    size_t AllocSize = Size * Count;
+    bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
+    if (NeedToGrow)
+      Ret = addBuffers();
+    else
+      Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize));
+
+    if (!Ret)
+      return nullptr;
+
+    Offset += Size;
+    return Ret;
+  }
+
+  /// Return either a fixed buffer or next buffer
+  void *get(bool Next) { return Next ? getNext() : get(); }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
new file mode 100644
index 0000000000000..b3ecd25f56ddd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -0,0 +1,189 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <level_zero/ze_api.h>
+
+#include "Shared/EnvironmentVar.h"
+
+#include "L0Defs.h"
+
+namespace llvm::omp::target::plugin {
+/// Command submission mode
+enum class CommandModeTy { Sync = 0, Async, AsyncOrdered };
+
+/// Specialization constants used for a module compilation.
+/// Owns heap-allocated copies of the constant values.
+class SpecConstantsTy {
+  std::vector<uint32_t> ConstantIds;
+  std::vector<const void *> ConstantValues;
+
+public:
+  SpecConstantsTy() = default;
+  SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(SpecConstantsTy &&) = delete;
+  /// Move constructor: transfers ownership of the value buffers. The previous
+  /// `const SpecConstantsTy &&` overload silently *copied* the vectors
+  /// (std::move of const members selects the copy constructor), so both
+  /// objects later delete[]'d the same pointers -- a double free.
+  SpecConstantsTy(SpecConstantsTy &&Other)
+      : ConstantIds(std::move(Other.ConstantIds)),
+        ConstantValues(std::move(Other.ConstantValues)) {}
+
+  ~SpecConstantsTy() {
+    for (auto I : ConstantValues) {
+      const char *ValuePtr = reinterpret_cast<const char *>(I);
+      delete[] ValuePtr;
+    }
+  }
+
+  /// Record a specialization constant with identifier \p Id and value \p Val.
+  /// The value is copied into owned storage released by the destructor.
+  template <typename T> void addConstant(uint32_t Id, T Val) {
+    const size_t ValSize = sizeof(Val);
+    char *ValuePtr = new char[ValSize];
+    *reinterpret_cast<T *>(ValuePtr) = Val;
+
+    ConstantIds.push_back(Id);
+    ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
+  }
+
+  /// Package the recorded constants in the layout expected by zeModuleCreate.
+  ze_module_constants_t getModuleConstants() const {
+    ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
+                              ConstantIds.data(),
+                              // Unfortunately we have to const_cast it.
+                              // L0 data type should probably be fixed.
+                              const_cast<const void **>(ConstantValues.data())};
+    return Tmp;
+  }
+};
+#define FIXED static constexpr
+
+/// L0 Plugin flags packed in a single 64-bit word.
+struct L0OptionFlagsTy {
+  /// Enable the memory pool (on by default).
+  uint64_t UseMemoryPool : 1;
+  uint64_t Reserved : 63;
+  L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {}
+};
+
+struct L0OptionsTy {
+  /// Binary flags
+  L0OptionFlagsTy Flags;
+
+  /// Staging buffer size
+  size_t StagingBufferSize = L0StagingBufferSize;
+
+  /// Staging buffer count
+  size_t StagingBufferCount = L0StagingBufferCount;
+
+  // TODO: This should probably be an array indexed by AllocKind
+  /// Memory pool parameters
+  /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
+  std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
+      {TARGET_ALLOC_DEVICE, {1, 4, 256}},
+      {TARGET_ALLOC_HOST, {1, 4, 256}},
+      {TARGET_ALLOC_SHARED, {8, 4, 256}}};
+
+  /// Parameters for memory pools dedicated to reduction scratch space
+  std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
+
+  /// Oversubscription rate for normal kernels
+  FIXED uint32_t SubscriptionRate = 4;
+
+  /// Loop kernels with known ND-range may be known to have
+  /// few iterations and they may not exploit the offload device
+  /// to the fullest extent.
+  /// Let's assume a device has N total HW threads available,
+  /// and the kernel requires M hardware threads with LWS set to L.
+  /// If (M < N * ThinThreadsThreshold), then we will try
+  /// to iteratively divide L by 2 to increase the number of HW
+  /// threads used for executing the kernel. Effectively, we will
+  /// end up with L less than the kernel's SIMD width, so the HW
+  /// threads will not use all their SIMD lanes. This (presumably) should
+  /// allow more parallelism, because the stalls in the SIMD lanes
+  /// will be distributed across more HW threads, and the probability
+  /// of having a stall (or a sequence of stalls) on a critical path
+  /// in the kernel should decrease.
+  /// Anyway, this is just a heuristics that seems to work well for some
+  /// kernels (which poorly expose parallelism in the first place).
+  FIXED double ThinThreadsThreshold = 0.1;
+
+  /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
+  /// All the discard filter should be before the accept filter.
+  std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
+
+  /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
+  bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
+
+  // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+  // builtins.
+  std::string CompilationOptions = "-cl-std=CL2.0 ";
+  std::string InternalCompilationOptions = "-cl-take-global-address";
+  std::string UserCompilationOptions = "";
+
+  // Spec constants used for all modules.
+  SpecConstantsTy CommonSpecConstants;
+
+  /// Command execution mode.
+  /// Whether the runtime uses asynchronous mode or not depends on the type of
+  /// devices and whether immediate command list is fully enabled.
+  CommandModeTy CommandMode = CommandModeTy::Async;
+
+  bool Init = false; // have the options already been processed
+
+  /// Read environment variables
+  L0OptionsTy() {}
+
+  void processEnvironmentVars();
+
+  /// Process environment variables once; subsequent calls are no-ops.
+  void init() {
+    if (!Init) {
+      processEnvironmentVars();
+      Init = true;
+    }
+  }
+
+  /// Parse the string and split it into tokens of string_views based on the
+  /// Delim character.
+  std::vector<std::string_view> tokenize(const std::string_view &Filter,
+                                         const std::string &Delim,
+                                         bool ProhibitEmptyTokens = false);
+
+  /// Return true iff the string is non-empty and all characters are decimal
+  /// digits. Characters are widened through unsigned char: passing a plain
+  /// (possibly negative) char to ::isdigit is undefined behavior.
+  bool isDigits(const std::string_view &str) {
+    if (str.size() == 0)
+      return false;
+    return std::all_of(str.begin(), str.end(),
+                       [](unsigned char c) { return ::isdigit(c) != 0; });
+  }
+
+  /// Case-insensitive string equality.
+  bool match(const std::string &Var, const std::string &Matched) {
+    if (Var.size() != Matched.size())
+      return false;
+
+    // Compare through unsigned char: std::tolower on a negative char is UB.
+    auto equals = [](unsigned char a, unsigned char b) {
+      return std::tolower(a) == std::tolower(b);
+    };
+    return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
+                      equals);
+  }
+
+  bool match(const std::string &Var, const char *Matched) {
+    std::string Str(Matched);
+    return match(Var, Str);
+  }
+
+  bool match(const StringEnvar &Var, const char *Matched) {
+    return match(Var.get(), Matched);
+  }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..4658c1cdab1df
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,136 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Plugin interface for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Defs.h"
+#include "L0Device.h"
+#include "L0Memory.h"
+#include "L0Options.h"
+#include "L0Program.h"
+#include "TLS.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Class implementing the LevelZero specific functionalities of the plugin.
+class LevelZeroPluginTy final : public GenericPluginTy {
+private:
+  /// Number of devices available including subdevices
+  uint32_t NumDevices = 0;
+
+  /// Context (and Driver) specific data
+  std::list<L0ContextTy> ContextList;
+
+  /// L0 device used by each OpenMP device
+  using DeviceContainerTy = llvm::SmallVector<L0DeviceTy *>;
+  DeviceContainerTy L0Devices;
+
+  // Table containing per-thread information using TLS
+  L0ThreadTblTy ThreadTLSTable;
+  // Table containing per-thread information for each device using TLS
+  L0DeviceTLSTableTy DeviceTLSTable;
+  // Table containing per-thread information for each Context using TLS
+  L0ContextTLSTableTy ContextTLSTable;
+
+  /// L0 plugin global options
+  static L0OptionsTy Options;
+
+  /// Global mutex
+  std::mutex GlobalMutex;
+
+  /// Common pool of AsyncQueue
+  AsyncQueuePoolTy AsyncQueuePool;
+
+  /// Per-thread data of the calling thread.
+  auto &getTLS() { return ThreadTLSTable.get(); }
+
+public:
+  LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
+  virtual ~LevelZeroPluginTy() {}
+
+  /// Per-thread data associated with the given device.
+  auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); }
+  /// Per-thread data associated with the given L0 context.
+  auto &getContextTLS(ze_context_handle_t Context) {
+    return ContextTLSTable.get(Context);
+  }
+
+  /// Access the global plugin options (read-only).
+  static const auto &getOptions() { return Options; }
+
+  auto &getGlobalMutex() { return GlobalMutex; }
+
+  /// Simple iterable begin/end pair over the plugin's device list.
+  struct DevicesRangeTy {
+    using iterator = DeviceContainerTy::iterator;
+
+    iterator BeginIt;
+    iterator EndIt;
+
+    DevicesRangeTy(iterator BeginIt, iterator EndIt)
+        : BeginIt(BeginIt), EndIt(EndIt) {}
+
+    auto &begin() { return BeginIt; }
+    auto &end() { return EndIt; }
+  };
+
+  /// Range over all L0 devices known to the plugin.
+  auto getDevicesRange() {
+    return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
+  }
+
+  /// Clean-up routine to be invoked by the destructor or
+  /// LevelZeroPluginTy::deinit.
+  void closeRTL();
+
+  /// Find L0 devices and initialize device properties.
+  /// Returns number of devices reported to omptarget.
+  int32_t findDevices();
+
+  L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
+    assert("Invalid device ID" && DeviceId >= 0 &&
+           DeviceId < static_cast<int32_t>(L0Devices.size()));
+    return *L0Devices[DeviceId];
+  }
+
+  uint32_t getNumRootDevices() const { return NumDevices; }
+
+  /// Get a free async queue: thread-local cache first, then the shared pool.
+  AsyncQueueTy *getAsyncQueue() {
+    auto *Queue = getTLS().getAsyncQueue();
+    if (!Queue)
+      Queue = AsyncQueuePool.get();
+    return Queue;
+  }
+
+  /// Reset \p Queue and return it to the thread-local cache, falling back to
+  /// the shared pool when the cache declines it.
+  void releaseAsyncQueue(AsyncQueueTy *Queue) {
+    if (!Queue)
+      return;
+    Queue->reset();
+    Queue->InUse = false;
+    if (!getTLS().releaseAsyncQueue(Queue))
+      AsyncQueuePool.release(Queue);
+  }
+
+  // Plugin interface
+
+  Expected<int32_t> initImpl() override;
+  Error deinitImpl() override;
+  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+                                int32_t NumDevices) override;
+  GenericGlobalHandlerTy *createGlobalHandler() override;
+  uint16_t getMagicElfBits() const override;
+  Triple::ArchType getTripleArch() const override;
+  const char *getName() const override;
+  Expected<bool> isELFCompatible(uint32_t DeviceId,
+                                 StringRef Image) const override;
+
+  Error flushQueueImpl(omp_interop_val_t *Interop) override;
+  Error syncBarrierImpl(omp_interop_val_t *Interop) override;
+  Error asyncBarrierImpl(omp_interop_val_t *Interop) override;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
new file mode 100644
index 0000000000000..a548b486f4642
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -0,0 +1,135 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Kernel.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Program data to be initialized by plugin
+/// NOTE(review): the field names (dynamic memory bounds, EU counts) suggest
+/// this struct mirrors a device-side structure consumed by the device RTL;
+/// confirm before reordering, adding, or resizing any field, since both
+/// sides must agree on the layout.
+struct ProgramDataTy {
+  int Initialized = 0;
+  int NumDevices = 0;
+  int DeviceNum = -1;
+  uint32_t TotalEUs = 0;
+  uint32_t HWThreadsPerEU = 0;
+  uintptr_t DynamicMemoryLB = 0;
+  uintptr_t DynamicMemoryUB = 0;
+  int DeviceType = 0;
+  void *DynamicMemPool = nullptr;
+  int TeamsThreadLimit = 0;
+};
+
+/// Level Zero program that can contain multiple modules.
+class L0ProgramTy : public DeviceImageTy {
+  /// Handle multiple modules within a single target image
+  llvm::SmallVector<ze_module_handle_t> Modules;
+
+  /// Map of kernel names to Modules
+  std::unordered_map<std::string, ze_module_handle_t> KernelsToModuleMap;
+
+  /// List of kernels built for this image
+  /// We need to delete them ourselves as the main library is not doing
+  /// that right now
+  std::list<L0KernelTy *> Kernels;
+
+  /// Module that contains global data including device RTL
+  ze_module_handle_t GlobalModule = nullptr;
+
+  /// Requires module link
+  bool RequiresModuleLink = false;
+
+  /// Is this module library
+  bool IsLibModule = false;
+
+  /// Build a single module with the given image, build option, and format.
+  int32_t addModule(const size_t Size, const uint8_t *Image,
+                    const std::string &BuildOption, ze_module_format_t Format);
+  /// Read file and return the size of the binary if successful.
+  size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
+  /// Read a SPIR-V file into \p OutSPV.
+  int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
+  /// Rewrite driver-level build options into their backend equivalents.
+  void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+                                        std::string &Options) const;
+
+  /// Check if the image should be handled as a library module
+  void setLibModule();
+
+  /// Device this program was built for.
+  L0DeviceTy &getL0Device() const;
+
+public:
+  L0ProgramTy() = delete;
+
+  L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
+              const __tgt_device_image *Image)
+      : DeviceImageTy(ImageId, Device, Image) {}
+
+  ~L0ProgramTy();
+
+  /// Not copyable or movable: module and kernel handles are released in the
+  /// destructor and must have a single owner.
+  L0ProgramTy(const L0ProgramTy &) = delete;
+  L0ProgramTy(L0ProgramTy &&) = delete;
+  L0ProgramTy &operator=(const L0ProgramTy &) = delete;
+  L0ProgramTy &operator=(L0ProgramTy &&) = delete;
+
+  /// Downcast a generic device image to its L0 program representation.
+  static L0ProgramTy &makeL0Program(DeviceImageTy &Image) {
+    return static_cast<L0ProgramTy &>(Image);
+  }
+
+  /// Build modules from the target image description
+  int32_t buildModules(std::string &BuildOptions);
+
+  /// Link modules stored in \p Modules.
+  int32_t linkModules();
+
+  /// Loads the kernels names from all modules
+  int32_t loadModuleKernels();
+
+  /// Read data from the location in the device image which corresponds to the
+  /// specified global variable name.
+  int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
+
+  /// Write data to the location in the device image which corresponds to the
+  /// specified global variable name.
+  int32_t writeGlobalVariable(const char *Name, size_t Size,
+                              const void *HostPtr);
+
+  /// Looks up an OpenMP declare target global variable with the given
+  /// \p Name and \p Size in the device environment for the current device.
+  /// The lookup is first done via the device offload table. If it fails,
+  /// then the lookup falls back to non-OpenMP specific lookup on the device.
+  void *getOffloadVarDeviceAddr(const char *Name) const;
+
+  /// Returns the handle of a module that contains a given Kernel name
+  ze_module_handle_t findModuleFromKernelName(const char *KernelName) const {
+    auto K = KernelsToModuleMap.find(std::string(KernelName));
+    if (K == KernelsToModuleMap.end())
+      return nullptr;
+
+    return K->second;
+  }
+
+  /// Register a kernel so the destructor can delete it with this image.
+  void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); }
+};
+
+/// Global-variable handler that resolves symbol metadata for a device image
+/// through the plugin's device-side lookup.
+struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy {
+  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                    DeviceImageTy &Image,
+                                    GlobalTy &DeviceGlobal) override;
+};
+
+/// Check whether the given image is a valid "oneomp" image; on success the
+/// image format version is reported through \p MajorVer / \p MinorVer.
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer);
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer);
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
new file mode 100644
index 0000000000000..2eeae81016dee
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -0,0 +1,193 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing L0
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+#pragma once
+
+#include "Shared/Debug.h"
+#include "omptarget.h"
+#include <string>
+#include <level_zero/ze_api.h>
+
+#define STR(x) #x
+#define TO_STRING(x) STR(x)
+
+/// Verbose debug print: emitted only when the debug level is above 1.
+#define DPCALL(...)                                                            \
+  do {                                                                         \
+    if (getDebugLevel() > 1)                                                   \
+      DP(__VA_ARGS__);                                                         \
+  } while (0)
+
+/// Report an unrecoverable error for the current function and terminate.
+#define FATAL_ERROR(Msg)                                                       \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg);   \
+    exit(EXIT_FAILURE);                                                        \
+  } while (0)
+
+/// Print a warning prefixed with the plugin's debug prefix.
+#define WARNING(...)                                                           \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Warning: " __VA_ARGS__);                                  \
+  } while (0)
+
+/// Warn about an unrecognized value given for a plugin option.
+#define INVALID_OPTION(Name, Value)                                            \
+  WARNING("Ignoring invalid option " #Name "=%s\n", Value)
+
+/// Invoke a Level Zero entry point and store its result in \p Rc.
+#define CALL_ZE(Rc, Fn, ...)                                                   \
+  do {                                                                         \
+      Rc = Fn(__VA_ARGS__);                                                    \
+  } while (0)
+
+/// Invoke a Level Zero entry point, storing the result in \p Rc and logging
+/// any failure; execution continues regardless of the result.
+#define CALL_ZE_RC(Rc, Fn, ...)                                                \
+  do {                                                                         \
+    CALL_ZE(Rc, Fn, __VA_ARGS__);                                              \
+    if (Rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc,    \
+         getZeErrorName(Rc));                                                  \
+    }                                                                          \
+  } while(0)
+/// For non-thread-safe functions
+#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...)                                     \
+  do {                                                                         \
+    Mtx.lock();                                                                \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    Mtx.unlock();                                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      return Ret;                                                              \
+    }                                                                          \
+  } while (0)
+
+#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...)                                     \
+  CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
+
+/// For thread-safe functions
+#define CALL_ZE_RET(Ret, Fn, ...)                                              \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      return Ret;                                                              \
+    }                                                                          \
+  } while (0)
+
+#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ERROR(Fn, ...)                                             \
+  CALL_ZE_RET(                                                                 \
+    Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s",           \
+    STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
+
+
+
+#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...)                                     \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      const char *err_str = nullptr;                                           \
+      rc = zeDriverGetLastErrorDescription(                                    \
+          Dev.getDriverHandle(), &err_str);                                    \
+      fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn,          \
+              err_str);                                                        \
+    }                                                                          \
+  } while (0)
+
+/// Invoke a Level Zero entry point; on failure, log and terminate the
+/// process.
+#define CALL_ZE_EXIT_FAIL(Fn, ...)                                             \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != ZE_RESULT_SUCCESS) {                                             \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc,    \
+         getZeErrorName(rc));                                                  \
+      std::exit(EXIT_FAILURE);                                                 \
+    }                                                                          \
+  } while (0)
+
+/// Invoke a Level Zero extension entry point (via CALL_ZE_EXT_SILENT,
+/// defined elsewhere) and return \p Ret on failure without logging.
+#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...)                         \
+  do {                                                                         \
+    ze_result_t rc;                                                            \
+    CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__);                         \
+    if (rc != ZE_RESULT_SUCCESS)                                               \
+      return Ret;                                                              \
+  } while (0)
+
+
+#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...)                               \
+  CALL_ZE_EXT_SILENT_RET(Device,                                               \
+      Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s",          \
+			 STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__)                    
+
+/// X-macro listing every Level Zero result code we can report by name;
+/// \p Fn is applied to each enumerator (see getZeErrorName below).
+#define FOREACH_ZE_ERROR_CODE(Fn)                                              \
+  Fn(ZE_RESULT_SUCCESS)                                                        \
+  Fn(ZE_RESULT_NOT_READY)                                                      \
+  Fn(ZE_RESULT_ERROR_DEVICE_LOST)                                              \
+  Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY)                                       \
+  Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)                                     \
+  Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE)                                     \
+  Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE)                                      \
+  Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET)                                    \
+  Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE)                                \
+  Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS)                                 \
+  Fn(ZE_RESULT_ERROR_NOT_AVAILABLE)                                            \
+  Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)                                   \
+  Fn(ZE_RESULT_WARNING_DROPPED_DATA)                                           \
+  Fn(ZE_RESULT_ERROR_UNINITIALIZED)                                            \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION)                                      \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT)                                         \
+  Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE)                                      \
+  Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE)                                     \
+  Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER)                                     \
+  Fn(ZE_RESULT_ERROR_INVALID_SIZE)                                             \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE)                                         \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION)                                      \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION)                                  \
+  Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT)                                 \
+  Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME)                                      \
+  Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME)                                    \
+  Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION)                             \
+  Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX)                            \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE)                             \
+  Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE)                           \
+  Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED)                                  \
+  Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE)                                \
+  Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS)                                      \
+  Fn(ZE_RESULT_WARNING_ACTION_REQUIRED)                                        \
+  Fn(ZE_RESULT_ERROR_UNKNOWN)
+
+/// Translate a Level Zero result code into its enumerator name for logging.
+/// Codes not present in FOREACH_ZE_ERROR_CODE are reported as
+/// ZE_RESULT_ERROR_UNKNOWN.
+#define CASE_TO_STRING(Num) case Num: return #Num;
+inline const char *getZeErrorName(int32_t Error) {
+  switch (Error) {
+    FOREACH_ZE_ERROR_CODE(CASE_TO_STRING)
+  default:
+    return "ZE_RESULT_ERROR_UNKNOWN";
+  }
+}
+// Keep the helper macro from leaking out of this header.
+#undef CASE_TO_STRING
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
new file mode 100644
index 0000000000000..8a5f41312e129
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -0,0 +1,86 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Thread Level Storage abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Memory.h"
+#include "L0Trace.h"
+#include "PerThreadTable.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// All thread-local data used by the Plugin
+class L0ThreadTLSTy {
+  /// Subdevice encoding
+  int64_t SubDeviceCode = 0;
+
+  /// Fixed-size per-thread cache of async queues.
+  static constexpr int32_t PerThreadQueues = 10;
+  AsyncQueueTy AsyncQueues[PerThreadQueues];
+  /// Number of entries of AsyncQueues currently handed out.
+  int32_t UsedQueues = 0;
+
+public:
+  L0ThreadTLSTy() = default;
+  /// Not copyable or movable: pointers handed out by getAsyncQueue() refer
+  /// to this object's storage.
+  L0ThreadTLSTy(const L0ThreadTLSTy &) = delete;
+  L0ThreadTLSTy(L0ThreadTLSTy &&) = delete;
+  L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete;
+  L0ThreadTLSTy &operator=(L0ThreadTLSTy &&) = delete;
+  ~L0ThreadTLSTy() = default;
+
+  /// Release per-thread resources (currently nothing to release).
+  void clear() {}
+
+  int64_t getSubDeviceCode() const { return SubDeviceCode; }
+
+  void setSubDeviceCode(int64_t Code) { SubDeviceCode = Code; }
+
+  /// Return a free queue from this thread's cache and mark it in use, or
+  /// nullptr when all PerThreadQueues queues are already taken.
+  AsyncQueueTy *getAsyncQueue() {
+    if (UsedQueues >= PerThreadQueues)
+      return nullptr;
+    // UsedQueues says there is a free queue in this thread; find it.
+    for (auto &Queue : AsyncQueues) {
+      if (!Queue.InUse) {
+        Queue.InUse = true;
+        UsedQueues++;
+        return &Queue;
+      }
+    }
+    // UsedQueues and the InUse flags disagree: bookkeeping bug. Fail softly
+    // in release builds instead of dereferencing a null pointer.
+    assert(false && "A queue should have been found!");
+    return nullptr;
+  }
+
+  /// Hand back a queue if it belongs to this thread's cache. Returns true
+  /// when the queue was reclaimed here, false when it is not a local queue.
+  bool releaseAsyncQueue(AsyncQueueTy *Queue) {
+    if (Queue < &AsyncQueues[0] || Queue >= &AsyncQueues[0] + PerThreadQueues)
+      return false;
+    // It's a local queue.
+    Queue->InUse = false;
+    UsedQueues--;
+    return true;
+  }
+};
+
+/// Table of per-thread plugin state; clearing it resets every entry.
+struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
+  void clear() {
+    PerThread::clear([](auto &TLS) { TLS.clear(); });
+  }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
new file mode 100644
index 0000000000000..3f50ffd2a7260
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -0,0 +1,41 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Create a context on \p zeDriver and set up the shared event pool and
+/// host memory allocator.
+/// NOTE(review): CALL_ZE_RET_VOID returns early on failure, which leaves the
+/// object partially constructed (e.g. zeContext unset) with no error
+/// reported to the caller -- confirm callers can tolerate this.
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+                         int32_t /*DriverId*/)
+    : Plugin(Plugin), zeDriver(zeDriver) {
+  CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+  DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+  ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+  EventPool.init(zeContext, 0);
+  HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+/// Return the calling thread's staging buffer for this context, creating it
+/// lazily on first use with the plugin-configured size and count.
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+  auto &Buffer = Plugin.getContextTLS(getZeContext()).getStagingBuffer();
+  if (!Buffer.initialized()) {
+    const auto &Options = Plugin.getOptions();
+    Buffer.init(getZeContext(), Options.StagingBufferSize,
+                Options.StagingBufferCount);
+  }
+  return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..0029d00a07685
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1065 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instatiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Thread-local state for this device is owned by the plugin, keyed by ID.
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+  auto &OwnerPlugin = getPlugin();
+  return OwnerPlugin.getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+/// Each entry lists PCI ID prefixes terminated by PCIIdTy::None; computeArch
+/// compares against the high byte of the PCI device ID.
+/// NOTE(review): the first two entries both map to DeviceArch_Gen
+/// (presumably distinct generations collapsed into one arch); also confirm
+/// that PCIIdTy::RTL is the intended enumerator (Raptor Lake is commonly
+/// abbreviated RPL).
+static struct {
+  DeviceArchTy arch;
+  PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::SKL,
+                       PCIIdTy::KBL,
+                       PCIIdTy::CFL, PCIIdTy::CFL_2,
+                       PCIIdTy::ICX,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::TGL, PCIIdTy::TGL_2,
+                       PCIIdTy::DG1,
+                       PCIIdTy::RKL,
+                       PCIIdTy::ADLS,
+                       PCIIdTy::RTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeLPG,
+                      {PCIIdTy::MTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPC,
+                      {PCIIdTy::PVC,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPG,
+                      {PCIIdTy::DG2_ATS_M,
+                       PCIIdTy::DG2_ATS_M_2,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2LP,
+                      {PCIIdTy::LNL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2HP,
+                      {PCIIdTy::BMG,
+                       PCIIdTy::None}},
+};
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+/// Determine the device architecture from the PCI device ID by scanning
+/// DeviceArchMap. Returns DeviceArch_None (with a warning) when the ID is
+/// unknown or unavailable.
+DeviceArchTy L0DeviceTy::computeArch() const {
+  const auto PCIDeviceId = getPCIId();
+  if (PCIDeviceId != 0) {
+    // The table entries are matched against the high byte of the PCI device
+    // ID only, so mask once up front (it is invariant across the loops).
+    const auto MaskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
+    for (int Arch = 0; Arch < DeviceArchMapSize; Arch++) {
+      // Each ID list is terminated by PCIIdTy::None.
+      for (int I = 0;; I++) {
+        const auto Id = DeviceArchMap[Arch].ids[I];
+        if (Id == PCIIdTy::None)
+          break;
+
+        if (MaskedId == Id)
+          return DeviceArchMap[Arch].arch;
+      }
+    }
+  }
+
+  DP("Warning: Cannot decide device arch for %s.\n", getNameCStr());
+  return DeviceArchTy::DeviceArch_None;
+}
+
+/// Query the device IP version (via the IP-version extension chained into
+/// the device properties) and report whether it is at least \p Version.
+/// Returns false if the query fails.
+bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const {
+  ze_device_ip_version_ext_t IPExt{};
+  IPExt.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
+  IPExt.pNext = nullptr;
+  ze_device_properties_t Props{};
+  Props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+  Props.pNext = &IPExt;
+  CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &Props);
+  return IPExt.ipVersion >= Version;
+}
+
+/// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+std::pair<uint32_t, uint32_t> L0DeviceTy::findComputeOrdinal() {
+  std::pair<uint32_t, uint32_t> Result{UINT32_MAX, 0};
+  const auto zeDevice = getZeDevice();
+  uint32_t NumGroups = 0;
+  CALL_ZE_RET(Result, zeDeviceGetCommandQueueGroupProperties, zeDevice,
+              &NumGroups, nullptr);
+  ze_command_queue_group_properties_t Init{
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+  std::vector<ze_command_queue_group_properties_t> GroupProps(NumGroups, Init);
+  CALL_ZE_RET(Result, zeDeviceGetCommandQueueGroupProperties, zeDevice,
+              &NumGroups, GroupProps.data());
+  // Pick the first group that advertises compute support.
+  // TODO: add a separate set of ordinals for compute queue groups which
+  // support cooperative kernels
+  for (uint32_t Idx = 0; Idx < NumGroups; Idx++) {
+    if (GroupProps[Idx].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+      Result = {Idx, GroupProps[Idx].numQueues};
+      break;
+    }
+  }
+  if (Result.first == UINT32_MAX)
+    DP("Error: no command queues are found\n");
+
+  return Result;
+}
+
+/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+/// With \p LinkCopy set, looks for a copy-only group with more than one
+/// queue (link engines); otherwise looks for a single-queue copy-only group
+/// (the main copy engine). Returns {UINT32_MAX, 0} when none matches.
+std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
+  std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+  uint32_t Count = 0;
+  const auto zeDevice = getZeDevice();
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              nullptr);
+  ze_command_queue_group_properties_t Init{
+      ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+  std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+  CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+              Properties.data());
+
+  for (uint32_t I = 0; I < Count; I++) {
+    const auto &Flags = Properties[I].flags;
+    // Only consider groups that support copy but not compute.
+    if ((Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+        (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) {
+      auto NumQueues = Properties[I].numQueues;
+      if (LinkCopy && NumQueues > 1) {
+        Ordinal = {I, NumQueues};
+        DP("Found link copy command queue for device " DPxMOD
+           ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n",
+           DPxPTR(zeDevice), Ordinal.first, Ordinal.second);
+        break;
+      } else if (!LinkCopy && NumQueues == 1) {
+        Ordinal = {I, NumQueues};
+        DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32
+           "\n",
+           DPxPTR(zeDevice), Ordinal.first);
+        break;
+      }
+    }
+  }
+  return Ordinal;
+}
+
+/// Dump the device's cached properties to the debug log.
+void L0DeviceTy::reportDeviceInfo() const {
+  DP("Device %" PRIu32 "\n", DeviceId);
+  DP("-- Name                         : %s\n", getNameCStr());
+  DP("-- PCI ID                       : 0x%" PRIx32 "\n", getPCIId());
+  DP("-- UUID                         : %s\n", getUuid().c_str());
+  DP("-- Number of total EUs          : %" PRIu32 "\n", getNumEUs());
+  DP("-- Number of threads per EU     : %" PRIu32 "\n", getNumThreadsPerEU());
+  DP("-- EU SIMD width                : %" PRIu32 "\n", getSIMDWidth());
+  DP("-- Number of EUs per subslice   : %" PRIu32 "\n", getNumEUsPerSubslice());
+  DP("-- Number of subslices per slice: %" PRIu32 "\n",
+     getNumSubslicesPerSlice());
+  DP("-- Number of slices             : %" PRIu32 "\n", getNumSlices());
+  DP("-- Local memory size (bytes)    : %" PRIu32 "\n",
+     getMaxSharedLocalMemory());
+  DP("-- Global memory size (bytes)   : %" PRIu64 "\n", getGlobalMemorySize());
+  DP("-- Cache size (bytes)           : %" PRIu64 "\n", getCacheSize());
+  DP("-- Max clock frequency (MHz)    : %" PRIu32 "\n", getClockRate());
+}
+
+/// Query and cache device properties, derive the architecture and default
+/// allocation kind, and set up ordinals, memory pools, and async mode.
+Error L0DeviceTy::internalInit() {
+  const auto &Options = getPlugin().getOptions();
+
+  // NOTE(review): Count = 1 retrieves only the first memory/cache property
+  // descriptor -- confirm this is intended for devices that report multiple
+  // memory modules or cache levels.
+  uint32_t Count = 1;
+  const auto zeDevice = getZeDevice();
+  CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties);
+  CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties);
+  CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count,
+                    &MemoryProperties);
+  CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count,
+                    &CacheProperties);
+
+  // NOTE(review): this copies the full fixed-size name buffer, including any
+  // trailing NUL padding, into DeviceName -- confirm consumers expect that.
+  DeviceName =
+      std::string(DeviceProperties.name, sizeof(DeviceProperties.name));
+
+  DP("Found a GPU device, Name = %s\n", DeviceProperties.name);
+
+  DeviceArch = computeArch();
+  // Default allocation kind for this device
+  AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED;
+
+  // Indirect access flags for kernels must match the default allocation kind.
+  ze_kernel_indirect_access_flags_t Flags =
+      (AllocKind == TARGET_ALLOC_DEVICE)
+          ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE
+          : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+  IndirectAccessFlags = Flags;
+
+  // Get the UUID
+  std::string uid = "";
+  for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++)
+    uid += std::to_string(DeviceProperties.uuid.id[n]);
+  DeviceUuid = std::move(uid);
+
+  ComputeOrdinal = findComputeOrdinal();
+
+  CopyOrdinal = findCopyOrdinal();
+
+  LinkCopyOrdinal = findCopyOrdinal(true);
+  // Async command submission is only used on discrete devices and can be
+  // disabled via the command-mode option.
+  IsAsyncEnabled =
+      isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
+  MemAllocator.initDevicePools(*this, getPlugin().getOptions());
+  l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
+  return Plugin::success();
+}
+
+/// Plugin-interface per-device initialization. All real work is done in
+/// internalInit(); nothing is needed here. The parameter is intentionally
+/// left unnamed: naming it `Plugin` would shadow the `Plugin` namespace
+/// used by Plugin::success() and trigger unused-parameter warnings.
+Error L0DeviceTy::initImpl(GenericPluginTy & /*Plugin*/) {
+  return Plugin::success();
+}
+
+/// Wait for all outstanding work tracked in \p AsyncInfo, commit delayed
+/// host-side copies, and optionally release the async queue back to the
+/// plugin. A no-op when async execution is not in use.
+int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+                                bool ReleaseQueue) {
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (!IsAsync)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+
+  AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue;
+
+  if (!AsyncQueue->WaitEvents.empty()) {
+    const auto &WaitEvents = AsyncQueue->WaitEvents;
+    if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
+      // Only need to wait for the last event
+      CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
+      // Synchronize on kernel event to support printf()
+      auto KE = AsyncQueue->KernelEvent;
+      if (KE && KE != WaitEvents.back()) {
+        CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX);
+      }
+      for (auto &Event : WaitEvents) {
+        releaseEvent(Event);
+      }
+    } else { // Async
+      // Wait for all events. We should wait and reset events in reverse order
+      // to avoid premature event reset. If we have a kernel event in the
+      // queue, it is the last event to wait for since all wait events of the
+      // kernel are signaled before the kernel is invoked. We always invoke
+      // synchronization on kernel event to support printf().
+      bool WaitDone = false;
+      for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
+        if (!WaitDone) {
+          CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX);
+          if (*Itr == AsyncQueue->KernelEvent)
+            WaitDone = true;
+        }
+        releaseEvent(*Itr);
+      }
+    }
+  }
+
+  // Commit delayed USM2M copies
+  for (auto &USM2M : AsyncQueue->USM2MList) {
+    std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+                std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+  }
+  // Commit delayed H2M copies
+  for (auto &H2M : AsyncQueue->H2MList) {
+    std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+                static_cast<char *>(std::get<1>(H2M)));
+  }
+  // Hand the queue back and reset the thread's staging buffer only when the
+  // caller is done with this async region.
+  if (ReleaseQueue) {
+    Plugin.releaseAsyncQueue(AsyncQueue);
+    getStagingBuffer().reset();
+    AsyncInfo->Queue = nullptr;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Copy \p Size bytes from host memory \p HstPtr to device memory \p TgtPtr.
+/// Host/shared USM targets are written directly with a CPU copy; device
+/// memory goes through the copy engine, optionally via a pinned staging
+/// buffer on discrete devices. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
+int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+                               __tgt_async_info *AsyncInfo) {
+  if (Size == 0)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+
+  const auto DeviceId = getDeviceId();
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    // Lazily attach a queue from the plugin pool on first use.
+    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  const auto TgtPtrType = getMemAllocType(TgtPtr);
+  if (TgtPtrType == ZE_MEMORY_TYPE_SHARED ||
+      TgtPtrType == ZE_MEMORY_TYPE_HOST) {
+    // Host-accessible USM: a plain CPU copy is sufficient.
+    std::copy_n(static_cast<const char *>(HstPtr), Size,
+                static_cast<char *>(TgtPtr));
+  } else {
+    const void *SrcPtr = HstPtr;
+    // On discrete devices, bounce small copies from pageable host memory
+    // through the staging buffer.
+    if (isDiscreteDevice() &&
+        static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
+        getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+      SrcPtr = getStagingBuffer().get(IsAsync);
+      std::copy_n(static_cast<const char *>(HstPtr), Size,
+                  static_cast<char *>(const_cast<void *>(SrcPtr)));
+    }
+    int32_t RC;
+    if (IsAsync)
+      RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo);
+    else
+      RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo);
+    if (RC != OFFLOAD_SUCCESS)
+      return RC;
+  }
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+       IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr),
+       DPxPTR(TgtPtr));
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Copy \p Size bytes from device memory \p TgtPtr back to host \p HstPtr.
+/// Host/shared USM sources may be copied by the CPU; while a kernel is in
+/// flight the copy is recorded and committed later in synchronize()/
+/// queryAsyncImpl(). Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
+int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+                                 __tgt_async_info *AsyncInfo) {
+  if (Size == 0)
+    return OFFLOAD_SUCCESS;
+
+  auto &Plugin = getPlugin();
+  const auto DeviceId = getDeviceId();
+  bool IsAsync = AsyncInfo && asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = Plugin.getAsyncQueue();
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  auto AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+  auto TgtPtrType = getMemAllocType(TgtPtr);
+  if (TgtPtrType == ZE_MEMORY_TYPE_HOST ||
+      TgtPtrType == ZE_MEMORY_TYPE_SHARED) {
+    bool CopyNow = true;
+    if (IsAsync) {
+      if (AsyncQueue->KernelEvent) {
+        // Delay Host/Shared USM to host memory copy since it must wait for
+        // kernel completion.
+        AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size);
+        CopyNow = false;
+      }
+    }
+    if (CopyNow) {
+      std::copy_n(static_cast<const char *>(TgtPtr), Size,
+                  static_cast<char *>(HstPtr));
+    }
+  } else {
+    void *DstPtr = HstPtr;
+    // Small copies to pageable host memory on discrete devices are routed
+    // through the staging buffer first.
+    if (isDiscreteDevice() &&
+        static_cast<size_t>(Size) <=
+            getPlugin().getOptions().StagingBufferSize &&
+        getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+      DstPtr = getStagingBuffer().get(IsAsync);
+    }
+    int32_t RC;
+    if (IsAsync)
+      RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
+                               /* CopyTo */ false);
+    else
+      RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo);
+    if (RC != OFFLOAD_SUCCESS)
+      return RC;
+    if (DstPtr != HstPtr) {
+      // The device copy landed in the staging buffer; forward it to the
+      // final host destination now (sync) or at synchronization (async).
+      if (IsAsync) {
+        // Store delayed H2M data copies
+        auto &H2MList = AsyncQueue->H2MList;
+        H2MList.emplace_back(DstPtr, HstPtr, static_cast<size_t>(Size));
+      } else {
+        std::copy_n(static_cast<char *>(DstPtr), Size,
+                    static_cast<char *>(HstPtr));
+      }
+    }
+  }
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+       IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr),
+       DPxPTR(HstPtr));
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Load (or return an already-loaded) device binary. Builds the Level Zero
+/// modules for \p TgtImage with the configured compilation options, links
+/// them, and resolves their kernels. Returns the program on success or an
+/// error describing the failing build stage.
+Expected<DeviceImageTy *>
+L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
+                           int32_t ImageId) {
+  auto *PGM = getProgramFromImage(TgtImage);
+  if (PGM) {
+    // Program already exists
+    return PGM;
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
+       DPxPTR(TgtImage->ImageStart));
+
+  const size_t NumEntries =
+      (size_t)(TgtImage->EntriesEnd - TgtImage->EntriesBegin);
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Expecting to have %zu entries defined\n", NumEntries);
+  (void)NumEntries; // silence warning
+
+  // Compose the option string: base options, then user options, then
+  // internal options (internal ones are appended after logging so they do
+  // not appear in the user-visible message above).
+  const auto &Options = getPlugin().getOptions();
+  std::string CompilationOptions(Options.CompilationOptions + " " +
+                                 Options.UserCompilationOptions);
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+       "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
+
+  CompilationOptions += " " + Options.InternalCompilationOptions;
+  auto &Program = addProgram(ImageId, TgtImage);
+
+  int32_t RC = Program.buildModules(CompilationOptions);
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in buildModules %d", RC);
+
+  RC = Program.linkModules();
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in linkModules %d", RC);
+
+  RC = Program.loadModuleKernels();
+  if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in buildKernels %d", RC);
+
+  return &Program;
+}
+
+/// Unload a previously loaded binary. Currently a no-op; module teardown is
+/// deferred (see TODO).
+Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
+  // Ignoring for now
+  // TODO: call properly L0Program unload
+  return Plugin::success();
+}
+
+/// Generic-plugin synchronization entry point; forwards to synchronize().
+/// Only the queue-releasing variant is supported at the moment.
+Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
+                                  bool ReleaseQueue) {
+  if (!ReleaseQueue) {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "Support for ReleaseQueue=false in %s"
+                         " not implemented yet\n",
+                         __func__);
+  }
+  int32_t RC = synchronize(&AsyncInfo);
+  return Plugin::check(RC, "Error in synchronizeImpl %d", RC);
+}
+
+/// Report whether the async queue attached to \p AsyncInfoWrapper still has
+/// outstanding wait events. Synchronous mode (or a missing queue) never has
+/// pending work.
+Expected<bool>
+L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  auto &Info = *static_cast<__tgt_async_info *>(AsyncInfoWrapper);
+  if (!Info.Queue || !asyncEnabled())
+    return false;
+
+  // Pending work is tracked as unreleased wait events on the queue.
+  const auto *Queue = static_cast<AsyncQueueTy *>(Info.Queue);
+  return !Queue->WaitEvents.empty();
+}
+
+/// Non-blocking completion check for \p AsyncInfo. If no wait events remain,
+/// commits the delayed host-side copies and releases the queue; otherwise
+/// returns immediately.
+/// NOTE(review): this early-returns while wait events are still outstanding
+/// without querying their status (e.g. zeEventQueryStatus), so the event
+/// list appears to drain only via synchronize() — confirm this is the
+/// intended progress model.
+Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
+  const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+  if (!IsAsync)
+    return Plugin::success();
+
+  auto &Plugin = getPlugin();
+  auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+  // Work still in flight: report success without finalizing.
+  if (!AsyncQueue->WaitEvents.empty())
+    return Plugin::success();
+
+  // Commit delayed USM2M copies
+  for (auto &USM2M : AsyncQueue->USM2MList) {
+    std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+                std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+  }
+  // Commit delayed H2M copies
+  for (auto &H2M : AsyncQueue->H2MList) {
+    std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+                static_cast<char *>(std::get<1>(H2M)));
+  }
+  Plugin.releaseAsyncQueue(AsyncQueue);
+  getStagingBuffer().reset();
+  AsyncInfo.Queue = nullptr;
+
+  return Plugin::success();
+}
+
+/// Generic-plugin allocation hook; forwards to dataAlloc(). A null
+/// \p HstPtr marks the request as a user-level allocation.
+void *L0DeviceTy::allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) {
+  return dataAlloc(Size, /*Align=*/0, Kind,
+                   /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
+                   /*DevMalloc=*/false);
+}
+
+/// Generic-plugin deallocation hook; \p Kind is not needed because the
+/// owning allocator is looked up from the pointer itself.
+int L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+  return dataDelete(TgtPtr);
+}
+
+/// Host-to-device transfer entry point; wraps submitData() and converts its
+/// integer status into an Error.
+Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
+  return Plugin::check(RC, "Error in dataSubmitImpl %d", RC);
+}
+
+/// Device-to-host transfer entry point; wraps retrieveData() and converts
+/// its integer status into an Error.
+Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
+                                   int64_t Size,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
+  return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC);
+}
+
+/// Device-to-device copy of \p Size bytes from this device's \p SrcPtr to
+/// \p DstPtr on \p DstDev.
+/// NOTE(review): UseCopyEngine is only honored on the synchronous path; the
+/// async path uses the immediate copy list unconditionally — confirm this is
+/// intended.
+Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+                                   void *DstPtr, int64_t Size,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+
+  L0DeviceTy &L0DstDev = L0DeviceTy::makeL0Device(DstDev);
+  // Use copy engine only for across-tile/device copies.
+  const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice();
+
+  if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) {
+    if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size,
+                            (__tgt_async_info *)AsyncInfoWrapper))
+      return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+  } else {
+    if (enqueueMemCopy(DstPtr, SrcPtr, Size,
+                       /* AsyncInfo */ nullptr,
+                       /* Locked */ false, UseCopyEngine))
+      return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+  }
+  return Plugin::success();
+}
+
+/// Ensure \p AsyncInfoWrapper carries an async queue, grabbing one from the
+/// plugin pool if it does not have one yet.
+Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  AsyncQueueTy *Queue = AsyncInfoWrapper.getQueueAs<AsyncQueueTy *>();
+  if (!Queue) {
+    Queue = getPlugin().getAsyncQueue();
+    AsyncInfoWrapper.setQueueAs<AsyncQueueTy *>(Queue);
+  }
+  return Plugin::success();
+}
+
+/// Populate the interop device-info structure with this device's Level Zero
+/// context/device handles, keeping any values the caller already filled in.
+Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
+  if (!Info->Context)
+    Info->Context = getZeContext();
+  if (!Info->Device)
+    Info->Device = reinterpret_cast<void *>(getZeDevice());
+  return Plugin::success();
+}
+
+/// Build the info tree reported for this device (identification, execution
+/// resources, and memory characteristics) from cached device properties.
+Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
+  InfoTreeNode Info;
+  Info.add("Device Number", getDeviceId());
+  Info.add("Device Name", getNameCStr());
+  Info.add("Device PCI ID", getPCIId());
+  Info.add("Device UUID", getUuid().c_str());
+  Info.add("Number of total EUs", getNumEUs());
+  Info.add("Number of threads per EU", getNumThreadsPerEU());
+  Info.add("EU SIMD width", getSIMDWidth());
+  Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
+  Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
+  Info.add("Number of slices", getNumSlices());
+  Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
+  Info.add("Global memory size (bytes)", getGlobalMemorySize());
+  Info.add("Cache size (bytes)", getCacheSize());
+  Info.add("Max clock frequency (MHz)", getClockRate());
+  return Info;
+}
+
+/// Allocate and placement-construct an L0 kernel object named \p Name using
+/// the plugin's allocator. Returns an error if allocation fails.
+Expected<GenericKernelTy &> L0DeviceTy::constructKernel(const char *Name) {
+  // Allocate and construct the L0 kernel.
+  L0KernelTy *L0Kernel = getPlugin().allocate<L0KernelTy>();
+  if (!L0Kernel)
+    return Plugin::error(ErrorCode::UNKNOWN,
+                         "Failed to allocate memory for L0 kernel");
+
+  new (L0Kernel) L0KernelTy(Name);
+
+  return *L0Kernel;
+}
+
+/// Query which Level Zero USM kind (host/device/shared) \p Ptr belongs to.
+/// Pointers unknown to the driver (e.g. plain malloc'ed host memory) are
+/// reported as ZE_MEMORY_TYPE_UNKNOWN instead of an error.
+uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const {
+  ze_memory_allocation_properties_t Props = {
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
+      nullptr,                // pNext extension chain
+      ZE_MEMORY_TYPE_UNKNOWN, // type
+      0,                      // id
+      0,                      // page size
+  };
+
+  ze_result_t Status;
+  CALL_ZE(Status, zeMemGetAllocProperties, getZeContext(), Ptr, &Props,
+          nullptr);
+
+  // The driver flags non-USM pointers as invalid arguments; treat that as
+  // "unknown" rather than a failure.
+  return (Status == ZE_RESULT_ERROR_INVALID_ARGUMENT) ? ZE_MEMORY_TYPE_UNKNOWN
+                                                      : Props.type;
+}
+
+/// Choose an interop flavor for the given request. Caller preferences are
+/// currently ignored; the answer is always level_zero, with in-order
+/// execution only when the device forces it.
+interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType,
+                                                   int32_t NumPrefers,
+                                                   interop_spec_t *Prefers) {
+  // no supported preference found, set default to level_zero, non-ordered
+  return interop_spec_t{
+      tgt_fr_level_zero, {forceInorderInterop() /*inorder*/, 0}, 0};
+}
+
+/// Create an OpenMP interop object exposing this device's Level Zero
+/// handles. For targetsync interop a fresh command list/queue (immediate or
+/// regular, depending on useImmForInterop()) is created and attached, with
+/// in-order mode taken from \p InteropSpec. Released via releaseInterop().
+Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
+                                                 interop_spec_t &InteropSpec) {
+  auto Ret =
+      new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext);
+  Ret->fr_id = tgt_fr_level_zero;
+  Ret->vendor_id = omp_vendor_intel;
+
+  if (InteropContext == kmp_interop_type_target ||
+      InteropContext == kmp_interop_type_targetsync) {
+    Ret->device_info.Platform = getZeDriver();
+    Ret->device_info.Device = getZeDevice();
+    Ret->device_info.Context = getZeContext();
+  }
+
+  Ret->rtl_property = new L0Interop::Property();
+  if (InteropContext == kmp_interop_type_targetsync) {
+    Ret->async_info = new __tgt_async_info();
+    auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
+
+    bool InOrder = InteropSpec.attrs.inorder;
+    Ret->attrs.inorder = InOrder;
+    // Keep a typed copy of the queue handle in the L0 property so
+    // releaseInterop() can destroy it without re-deducing its kind.
+    if (useImmForInterop()) {
+      auto CmdList = createImmCmdList(InOrder);
+      Ret->async_info->Queue = CmdList;
+      L0->ImmCmdList = CmdList;
+    } else {
+      Ret->async_info->Queue = createCommandQueue(InOrder);
+      L0->CommandQueue =
+          static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
+    }
+  }
+
+  return Ret;
+}
+
+/// Destroy an interop object created by createInterop(): tears down the
+/// command list/queue it owns (if any) and frees the property and interop
+/// structures. Rejects null or foreign-device interop handles.
+Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
+  const auto DeviceId = getDeviceId();
+
+  if (!Interop || Interop->device_id != (intptr_t)DeviceId) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  if (Interop->async_info && Interop->async_info->Queue) {
+    // The queue kind mirrors the creation-time choice in createInterop().
+    if (useImmForInterop()) {
+      auto ImmCmdList = L0->ImmCmdList;
+      CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList);
+    } else {
+      auto CmdQueue = L0->CommandQueue;
+      CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue);
+    }
+  }
+  delete L0;
+  delete Interop;
+
+  return Plugin::success();
+}
+
+/// Blocking memory copy of \p Size bytes from \p Src to \p Dst. Uses the
+/// immediate command list when enabled, otherwise closes/executes/resets a
+/// regular command list on the chosen engine. \p Locked indicates the queue
+/// mutex is already held by the caller; \p UseCopyEngine selects the copy
+/// engine over the compute engine. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
+int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+                                   __tgt_async_info *AsyncInfo, bool Locked,
+                                   bool UseCopyEngine) {
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+  ze_event_handle_t Event = nullptr;
+
+  if (useImmForCopy()) {
+    // Immediate path: the copy executes directly; block on its event.
+    CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+    Event = getEvent();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                     Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+  } else {
+    if (UseCopyEngine) {
+      CmdList = getCopyCmdList();
+      CmdQueue = getCopyCmdQueue();
+    } else {
+      CmdList = getCmdList();
+      CmdQueue = getCmdQueue();
+    }
+
+    // Regular path: record, close, execute, wait, and reset for reuse.
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                     Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    if (Locked) {
+      CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                       nullptr);
+    } else {
+      CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+                           CmdQueue, 1, &CmdList, nullptr);
+    }
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue non-blocking memory copy. This function is invoked only when IMM is
+/// fully enabled and async mode is requested.
+/// Enqueue non-blocking memory copy. This function is invoked only when IMM is
+/// fully enabled and async mode is requested. The signal event is appended
+/// to the queue's wait-event list so synchronize() can later wait on and
+/// release it. \p CopyTo is unused here. Returns OFFLOAD_SUCCESS/FAIL.
+int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+                                        __tgt_async_info *AsyncInfo,
+                                        bool CopyTo) {
+  const bool Ordered =
+      (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+  ze_event_handle_t SignalEvent = getEvent();
+  size_t NumWaitEvents = 0;
+  ze_event_handle_t *WaitEvents = nullptr;
+  AsyncQueueTy *AsyncQueue = reinterpret_cast<AsyncQueueTy *>(AsyncInfo->Queue);
+  if (!AsyncQueue->WaitEvents.empty()) {
+    // Use a single wait event if events are ordered or a kernel event exists.
+    NumWaitEvents = 1;
+    if (Ordered)
+      WaitEvents = &AsyncQueue->WaitEvents.back();
+    else if (AsyncQueue->KernelEvent)
+      WaitEvents = &AsyncQueue->KernelEvent;
+    else
+      NumWaitEvents = 0;
+  }
+  auto CmdList = getImmCopyCmdList();
+  CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+                   SignalEvent, NumWaitEvents, WaitEvents);
+  AsyncQueue->WaitEvents.push_back(SignalEvent);
+  return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue memory fill
+int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
+                                   size_t PatternSize, size_t Size) {
+  if (useImmForCopy()) {
+    const auto CmdList = getImmCopyCmdList();
+    auto Event = getEvent();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                     PatternSize, Size, Event, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+  } else {
+    auto CmdList = getCopyCmdList();
+    const auto CmdQueue = getCopyCmdQueue();
+    CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+                     PatternSize, Size, nullptr, 0, nullptr);
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                     nullptr);
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Generic-plugin fill entry point; forwards to the synchronous
+/// enqueueMemFill() and converts its status into an Error.
+Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                               int64_t PatternSize, int64_t Size,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  // TODO: support async version
+  // TODO: convert enqueueMemFill to return Error code
+  if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS)
+    return Plugin::success();
+
+  return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
+}
+
+/// Allocate \p Size bytes of device-accessible memory. TARGET_ALLOC_DEFAULT
+/// is resolved to a concrete kind first (device for user allocations and
+/// dedicated reduction pools, host when requested via \p AllocOpt, else the
+/// device's default kind); the request is then forwarded to the allocator
+/// for that kind. Returns the new pointer, or whatever the allocator
+/// returns on failure.
+void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
+                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
+                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+
+  // Reduction scratch/counter allocations come from a dedicated pool.
+  const bool UseDedicatedPool =
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) ||
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+  if (Kind == TARGET_ALLOC_DEFAULT) {
+    if (UserAlloc)
+      Kind = TARGET_ALLOC_DEVICE;
+    else if (AllocOpt == AllocOptionTy::ALLOC_OPT_HOST_MEM)
+      Kind = TARGET_ALLOC_HOST;
+    else if (UseDedicatedPool)
+      Kind = TARGET_ALLOC_DEVICE;
+    else
+      Kind = getAllocKind();
+  }
+  auto &Allocator = getMemAllocator(Kind);
+  return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
+                         MemAdvice, AllocOpt);
+}
+
+/// Free memory previously returned by dataAlloc(); the owning allocator is
+/// located from the pointer itself.
+int32_t L0DeviceTy::dataDelete(void *Ptr) {
+  auto &Allocator = getMemAllocator(Ptr);
+  return Allocator.dealloc(Ptr);
+}
+
+/// Make the range [\p Mem, \p Mem + \p Size) resident on this device so it
+/// can be accessed without on-demand migration. Returns
+/// OFFLOAD_SUCCESS/OFFLOAD_FAIL.
+int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
+  ze_result_t RC;
+  CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem,
+          Size);
+  if (RC != ZE_RESULT_SUCCESS) {
+    DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD
+       ".\n",
+       DPxPTR(Mem), DPxPTR(getZeDevice()));
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Command queues related functions
+/// Create a command list with given ordinal and flags
+// Command queues related functions
+/// Create a command list with given ordinal and flags. \p DeviceIdStr is
+/// used only for debug output. Returns nullptr on failure (via
+/// CALL_ZE_RET_NULL).
+ze_command_list_handle_t L0DeviceTy::createCmdList(
+    ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+    ze_command_list_flags_t Flags, const std::string &DeviceIdStr) {
+  ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+                                        nullptr, // extension
+                                        Ordinal, Flags};
+  ze_command_list_handle_t cmdList;
+  CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &cmdListDesc,
+                   &cmdList);
+  DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
+     ") for device %s.\n",
+     DPxPTR(cmdList), Ordinal, DeviceIdStr.c_str());
+  return cmdList;
+}
+
+/// Create a command list with default flags
+ze_command_list_handle_t
+L0DeviceTy::createCmdList(ze_context_handle_t Context,
+                          ze_device_handle_t Device, uint32_t Ordinal,
+                          const std::string &DeviceIdStr) {
+  return (Ordinal == UINT32_MAX)
+             ? nullptr
+             : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
+}
+
+/// Return this thread's compute command list, creating and caching it in
+/// TLS on first use.
+ze_command_list_handle_t L0DeviceTy::getCmdList() {
+  auto &DeviceTLS = getTLS();
+  if (auto Cached = DeviceTLS.getCmdList())
+    return Cached;
+
+  auto NewList = createCmdList(getZeContext(), getZeDevice(),
+                               getComputeEngine(), getZeId());
+  DeviceTLS.setCmdList(NewList);
+  return NewList;
+}
+
+/// Create a command queue with given ordinal and flags
+ze_command_queue_handle_t
+L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
+                           ze_device_handle_t Device, uint32_t Ordinal,
+                           uint32_t Index, ze_command_queue_flags_t Flags,
+                           const std::string &DeviceIdStr) {
+  ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+                                          nullptr, // extension
+                                          Ordinal,
+                                          Index,
+                                          Flags, // flags
+                                          ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+                                          ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_queue_handle_t cmdQueue;
+  CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &cmdQueueDesc,
+                   &cmdQueue);
+  DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
+     ", Flags: %" PRIu32 ") for device %s.\n",
+     DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.c_str());
+  return cmdQueue;
+}
+
+/// Create a command queue with default flags
+ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
+    ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+    uint32_t Index, const std::string &DeviceIdStr, bool InOrder) {
+  ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+  return (Ordinal == UINT32_MAX) ? nullptr
+                                 : createCmdQueue(Context, Device, Ordinal,
+                                                  Index, Flags, DeviceIdStr);
+}
+
+/// Create a new command queue for the given OpenMP device ID
+ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
+  auto cmdQueue =
+      createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(),
+                     getComputeIndex(), getZeId(), InOrder);
+  return cmdQueue;
+}
+
+/// Create an immediate command list
+ze_command_list_handle_t
+L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
+  ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+  ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+                               nullptr,
+                               Ordinal,
+                               Index,
+                               Flags,
+                               ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+                               ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+  ze_command_list_handle_t CmdList = nullptr;
+  CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
+                   &Desc, &CmdList);
+  DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32
+     ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n",
+     DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr());
+  return CmdList;
+}
+
+/// Create an immediate command list for copying
+ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
+  uint32_t Ordinal = getMainCopyEngine();
+  if (Ordinal == UINT32_MAX)
+    Ordinal = getLinkCopyEngine();
+  if (Ordinal == UINT32_MAX)
+    Ordinal = getComputeEngine();
+  return createImmCmdList(Ordinal, /*Index*/ 0);
+}
+
+/// Return this thread's compute command queue, creating and caching it in
+/// TLS on first use.
+ze_command_queue_handle_t L0DeviceTy::getCmdQueue() {
+  auto &DeviceTLS = getTLS();
+  if (auto Cached = DeviceTLS.getCmdQueue())
+    return Cached;
+
+  auto NewQueue = createCommandQueue();
+  DeviceTLS.setCmdQueue(NewQueue);
+  return NewQueue;
+}
+
+/// Return this thread's copy command list, preferring the main copy engine,
+/// then a link copy engine, and finally the compute engine. The list is
+/// cached in TLS on first use.
+ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
+  // Use main copy engine if available
+  if (hasMainCopyEngine()) {
+    auto &TLS = getTLS();
+    auto CmdList = TLS.getCopyCmdList();
+    if (!CmdList) {
+      CmdList = createCmdList(getZeContext(), getZeDevice(),
+                              getMainCopyEngine(), getZeId());
+      TLS.setCopyCmdList(CmdList);
+    }
+    return CmdList;
+  }
+  // Use link copy engine if available
+  if (hasLinkCopyEngine())
+    return getLinkCopyCmdList();
+  // Use compute engine otherwise
+  return getCmdList();
+}
+
+/// Return this thread's copy command queue, preferring the main copy
+/// engine, then a link copy engine, and finally the compute engine. The
+/// queue is cached in TLS on first use.
+ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
+  // Use main copy engine if available
+  if (hasMainCopyEngine()) {
+    auto &TLS = getTLS();
+    auto CmdQueue = TLS.getCopyCmdQueue();
+    if (!CmdQueue) {
+      CmdQueue = createCmdQueue(getZeContext(), getZeDevice(),
+                                getMainCopyEngine(), 0, getZeId());
+      TLS.setCopyCmdQueue(CmdQueue);
+    }
+    return CmdQueue;
+  }
+  // Use link copy engine if available
+  if (hasLinkCopyEngine())
+    return getLinkCopyCmdQueue();
+  // Use compute engine otherwise
+  return getCmdQueue();
+}
+
+/// Return this thread's link-engine copy command list, falling back to the
+/// main copy engine and then the compute engine. The list is created with
+/// the explicit-only flag and cached in TLS on first use.
+ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
+  // Use link copy engine if available
+  if (hasLinkCopyEngine()) {
+    auto &TLS = getTLS();
+    auto CmdList = TLS.getLinkCopyCmdList();
+    if (!CmdList) {
+      CmdList =
+          createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+                        ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
+      TLS.setLinkCopyCmdList(CmdList);
+    }
+    return CmdList;
+  }
+  // Use main copy engine if available
+  if (hasMainCopyEngine())
+    return getCopyCmdList();
+  // Use compute engine otherwise
+  return getCmdList();
+}
+
+/// Return this thread's link-engine copy command queue, falling back to the
+/// main copy engine and then the compute engine. Queue indices are spread
+/// across threads to use multiple link engines; the queue is cached in TLS
+/// on first use.
+ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
+  // Use link copy engine if available
+  if (hasLinkCopyEngine()) {
+    auto &TLS = getTLS();
+    auto CmdQueue = TLS.getLinkCopyCmdQueue();
+    if (!CmdQueue) {
+      // Try to use different copy engines for multiple threads
+      uint32_t Index =
+          __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
+      CmdQueue =
+          createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+                         Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
+      TLS.setLinkCopyCmdQueue(CmdQueue);
+    }
+    return CmdQueue;
+  }
+  // Use main copy engine if available
+  if (hasMainCopyEngine())
+    return getCopyCmdQueue();
+  // Use compute engine otherwise
+  return getCmdQueue();
+}
+
+/// Return this thread's immediate compute command list, creating and
+/// caching it in TLS on first use.
+ze_command_list_handle_t L0DeviceTy::getImmCmdList() {
+  auto &DeviceTLS = getTLS();
+  if (auto Cached = DeviceTLS.getImmCmdList())
+    return Cached;
+
+  auto NewList = createImmCmdList();
+  DeviceTLS.setImmCmdList(NewList);
+  return NewList;
+}
+
+/// Return this thread's immediate copy command list, creating and caching
+/// it in TLS on first use.
+ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() {
+  auto &DeviceTLS = getTLS();
+  if (auto Cached = DeviceTLS.getImmCopyCmdList())
+    return Cached;
+
+  auto NewList = createImmCopyCmdList();
+  DeviceTLS.setImmCopyCmdList(NewList);
+  return NewList;
+}
+
+/// Insert an execution barrier on the copy path so prior copies complete
+/// before subsequent commands. In AsyncOrdered mode commands are already
+/// serialized, so no barrier is needed.
+Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
+  const bool Ordered =
+      (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+
+  // Nothing to do if everything is ordered
+  if (Ordered)
+    return Plugin::success();
+
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+
+  if (useImmForCopy()) {
+    CmdList = getImmCopyCmdList();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+  } else {
+    // Regular path: record the barrier and submit the list for execution.
+    CmdList = getCopyCmdList();
+    CmdQueue = getCopyCmdQueue();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                      nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+  }
+
+  return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
new file mode 100644
index 0000000000000..06f01f23285fc
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
@@ -0,0 +1,134 @@
+//===--- level_zero/src/L0DynWrapper.cpp ---------------------------- C++ -*-===//
+//
+// Implement wrapper for level_zero API calls through dlopen
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+#include <memory>
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+// Dynamic-dispatch stub table (see DLWrap.h). Each DLWRAP(name, N) declares
+// a stub `name` taking N arguments that forwards through a function pointer;
+// the pointers are resolved at runtime by loadLevelZero() below via
+// dlwrap::symbol()/dlwrap::pointer().
+DLWRAP_INITIALIZE()
+
+// zeInit gets an internal stub (dlwrap_zeInit) because the public zeInit at
+// the bottom of this file must trigger the library load first.
+DLWRAP_INTERNAL(zeInit, 1)
+DLWRAP(zeDriverGet, 2)
+DLWRAP(zeDeviceGet, 3)
+DLWRAP(zeDeviceGetSubDevices, 3)
+DLWRAP(zeModuleCreate, 5)
+DLWRAP(zeModuleGetProperties, 2)
+DLWRAP(zeModuleBuildLogDestroy, 1)
+DLWRAP(zeModuleBuildLogGetString, 3)
+DLWRAP(zeModuleGetKernelNames, 3)
+DLWRAP(zeModuleDestroy, 1)
+DLWRAP(zeCommandListAppendBarrier, 4)
+DLWRAP(zeCommandListAppendLaunchKernel, 6)
+DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6)
+DLWRAP(zeCommandListAppendMemoryCopy, 7)
+DLWRAP(zeCommandListAppendMemoryCopyRegion, 12)
+DLWRAP(zeCommandListAppendMemoryFill, 8)
+DLWRAP(zeCommandListAppendMemoryPrefetch, 3)
+DLWRAP(zeCommandListAppendMemAdvise, 5)
+DLWRAP(zeCommandListClose, 1)
+DLWRAP(zeCommandListCreate, 4)
+DLWRAP(zeCommandListCreateImmediate, 4)
+DLWRAP(zeCommandListDestroy, 1)
+DLWRAP(zeCommandListReset, 1)
+DLWRAP(zeCommandQueueCreate, 4)
+DLWRAP(zeCommandQueueDestroy, 1)
+DLWRAP(zeCommandQueueExecuteCommandLists, 4)
+DLWRAP(zeCommandQueueSynchronize, 2)
+DLWRAP(zeContextCreate, 3)
+DLWRAP(zeContextDestroy, 1)
+DLWRAP(zeContextMakeMemoryResident, 4)
+DLWRAP(zeDeviceCanAccessPeer, 3)
+DLWRAP(zeDeviceGetProperties, 2)
+DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3)
+DLWRAP(zeDeviceGetComputeProperties, 2)
+DLWRAP(zeDeviceGetMemoryProperties, 3)
+DLWRAP(zeDeviceGetCacheProperties, 3)
+DLWRAP(zeDeviceGetGlobalTimestamps, 3)
+DLWRAP(zeDriverGetApiVersion, 2)
+DLWRAP(zeDriverGetExtensionFunctionAddress, 3)
+DLWRAP(zeDriverGetExtensionProperties, 3)
+DLWRAP(zeEventCreate, 3)
+DLWRAP(zeEventDestroy, 1)
+DLWRAP(zeEventHostReset, 1)
+DLWRAP(zeEventHostSynchronize, 2)
+DLWRAP(zeEventPoolCreate, 5)
+DLWRAP(zeEventPoolDestroy, 1)
+DLWRAP(zeEventQueryKernelTimestamp, 2)
+DLWRAP(zeFenceCreate, 3)
+DLWRAP(zeFenceDestroy, 1)
+DLWRAP(zeFenceHostSynchronize, 2)
+DLWRAP(zeKernelCreate, 3)
+DLWRAP(zeKernelDestroy, 1)
+DLWRAP(zeKernelGetName, 3)
+DLWRAP(zeKernelGetProperties, 2)
+DLWRAP(zeKernelSetArgumentValue, 4)
+DLWRAP(zeKernelSetGroupSize, 4)
+DLWRAP(zeKernelSetIndirectAccess, 2)
+DLWRAP(zeKernelSuggestGroupSize, 7)
+DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2)
+DLWRAP(zeMemAllocDevice, 6)
+DLWRAP(zeMemAllocHost, 5)
+DLWRAP(zeMemAllocShared, 7)
+DLWRAP(zeMemFree, 2)
+DLWRAP(zeMemGetAddressRange, 4)
+DLWRAP(zeMemGetAllocProperties, 4)
+DLWRAP(zeModuleDynamicLink, 3)
+DLWRAP(zeModuleGetGlobalPointer, 4)
+DLWRAP(zesDeviceEnumMemoryModules, 3)
+DLWRAP(zesMemoryGetState, 2)
+
+DLWRAP_FINALIZE()
+
+// LEVEL_ZERO_LIBRARY names the Level Zero loader shared library to dlopen;
+// it is expected to be supplied by the build system.
+#ifndef LEVEL_ZERO_LIBRARY
+#error "Level zero library not defined"
+#endif
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+// Prefix used by the DP() debug output in this file.
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+/// Load the Level Zero loader library and resolve every symbol registered
+/// in the dlwrap tables above. Returns true only if the library loads and
+/// all symbols are found.
+static bool loadLevelZero() {
+  const char *LibName = LEVEL_ZERO_LIBRARY;
+  std::string ErrStr;
+
+  DP("Trying to load %s\n", LibName);
+  auto Lib = std::make_unique<llvm::sys::DynamicLibrary>(
+      llvm::sys::DynamicLibrary::getPermanentLibrary(LibName, &ErrStr));
+  if (!Lib->isValid()) {
+    if (ErrStr.empty())
+      ErrStr = "unknown error";
+    DP("Unable to load library '%s': %s!\n", LibName, ErrStr.c_str());
+    return false;
+  }
+
+  // Resolve each wrapped entry point into its dlwrap function pointer.
+  const size_t NumSymbols = dlwrap::size();
+  for (size_t Idx = 0; Idx < NumSymbols; ++Idx) {
+    const char *Name = dlwrap::symbol(Idx);
+    void *Addr = Lib->getAddressOfSymbol(Name);
+    if (!Addr) {
+      DP("Unable to find '%s' in '%s'!\n", Name, LibName);
+      return false;
+    }
+    DP("Implementing %s with dlsym(%s) -> %p\n", Name, Name, Addr);
+    *dlwrap::pointer(Idx) = Addr;
+  }
+
+  return true;
+}
+
+/// Public zeInit entry point: triggers loading of the real Level Zero
+/// loader and resolution of all wrapped symbols, then forwards to the real
+/// zeInit. The load runs exactly once (thread-safe C++11 magic static);
+/// repeated zeInit calls — which the API permits — reuse the cached result
+/// instead of re-running the whole symbol-resolution loop each time.
+ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) {
+  static const bool LoadedOK = loadLevelZero();
+  if (!LoadedOK)
+    return ZE_RESULT_ERROR_UNKNOWN;
+  return dlwrap_zeInit(flags);
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
new file mode 100644
index 0000000000000..d1cb0b7bd50bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Launch this kernel on \p GenericDevice by delegating to the Level Zero
+/// team-region launcher and translating its status code into the plugin's
+/// Error convention.
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
+                             KernelArgsTy &KernelArgs,
+                             KernelLaunchParamsTy LaunchParams,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+
+  auto &Device = L0DeviceTy::makeL0Device(GenericDevice);
+  const int32_t Status = runTargetTeamRegion(
+      Device, KernelArgs, std::move(LaunchParams), AsyncInfoWrapper);
+  if (Status != OFFLOAD_SUCCESS)
+    return Plugin::error(error::ErrorCode::UNKNOWN,
+                         "Error in launch Kernel %s: %d", getName(), Status);
+  return Plugin::success();
+}
+
+/// Create the underlying ze kernel handle for this kernel from the module
+/// of \p Program that defines it.
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+  const auto *Name = getName();
+  auto Module = Program.findModuleFromKernelName(Name);
+  ze_kernel_desc_t Desc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, Name};
+  CALL_ZE_RET_ERROR(zeKernelCreate, Module, &Desc, &zeKernel);
+  return Plugin::success();
+}
+
+/// Initialize the kernel for \p Image: build the ze kernel handle and
+/// register this kernel with the owning program.
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+                           DeviceImageTy &Image) {
+  auto &Program = L0ProgramTy::makeL0Program(Image);
+
+  if (auto Err = buildKernel(Program))
+    return Err;
+
+  Program.addKernel(this);
+  return Plugin::success();
+}
+
+/// Read global thread limit and max teams from the host runtime. These
+/// values are subject to change at any program point, so every kernel
+/// execution needs to read the most recent values. Both ICVs default to
+/// INT_MAX in the runtime; that sentinel is normalized to 0 ("not set").
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+  // NOTE: Windows.h defines a max() macro, so the std::numeric_limits
+  //       calls below are guarded with parentheses.
+  constexpr int32_t Unset = (std::numeric_limits<int32_t>::max)();
+
+  const int RawLimit = omp_get_teams_thread_limit();
+  DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", RawLimit);
+  const int32_t ThreadLimit =
+      (RawLimit > 0 && RawLimit != Unset) ? RawLimit : 0;
+
+  const int RawTeams = omp_get_max_teams();
+  DP("omp_get_max_teams() returned %" PRId32 "\n", RawTeams);
+  const int32_t NumTeams = (RawTeams > 0 && RawTeams != Unset) ? RawTeams : 0;
+
+  return {NumTeams, ThreadLimit};
+}
+
+/// Pick work-group sizes and counts for a kernel launch that is not driven
+/// by explicit ND-range loop partitioning.
+///
+/// If the user forced a team count (\p NumTeams > 0) it is honored and the
+/// WG size defaults to the kernel's preferred width. Otherwise the WG count
+/// is derived from HW occupancy (subslices x HW threads per subslice),
+/// multiplied by the configured subscription rate, and optionally trimmed
+/// using a single-loop trip count from \p LoopLevels.
+///
+/// \param NumTeams       user-requested team count (0 if unspecified).
+/// \param ThreadLimit    user-requested max team size (0 if unspecified).
+/// \param LoopLevels     optional loop descriptor; only single-loop trip
+///                       counts are used here.
+/// \param GroupSizes     [out] WG sizes per dimension (3 entries).
+/// \param GroupCounts    [out] WG counts per dimension.
+/// \param HalfNumThreads halve the per-subslice HW thread budget (used when
+///                       the L0 program debugger is active on XeHPG).
+/// \param IsTeamsNDRange selects how a loop trip count is converted into a
+///                       team count.
+void L0KernelTy::decideKernelGroupArguments(
+    L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+    TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+    ze_group_count_t &GroupCounts, bool HalfNumThreads,
+    bool IsTeamsNDRange) const {
+
+  const KernelPropertiesTy &KernelPR = getProperties();
+
+  const auto DeviceId = Device.getDeviceId();
+  bool MaxGroupSizeForced = false;
+  bool MaxGroupCountForced = false;
+  uint32_t MaxGroupSize = Device.getMaxGroupSize();
+  const auto &Option = LevelZeroPluginTy::getOptions();
+  const auto OptSubscRate = Option.SubscriptionRate;
+
+  uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  uint32_t KernelWidth = KernelPR.Width;
+  uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+  // The kernel may support a smaller WG size than the device maximum.
+  if (KernelMaxThreadGroupSize < MaxGroupSize) {
+    MaxGroupSize = KernelMaxThreadGroupSize;
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Capping maximum team size to %" PRIu32
+         " due to kernel constraints.\n",
+         MaxGroupSize);
+  }
+
+  if (ThreadLimit > 0) {
+    MaxGroupSizeForced = true;
+    MaxGroupSize = ThreadLimit;
+  }
+
+  uint32_t MaxGroupCount = 0;
+  if (NumTeams > 0) {
+    MaxGroupCount = NumTeams;
+    MaxGroupCountForced = true;
+  }
+
+  if (MaxGroupCountForced) {
+    // If number of teams is specified by the user, then use KernelWidth
+    // WIs per WG by default, so that it matches
+    // decideLoopKernelGroupArguments() behavior.
+    if (!MaxGroupSizeForced)
+      MaxGroupSize = KernelWidth;
+  } else {
+    // Derive the team count from HW occupancy.
+    const uint32_t NumSubslices = Device.getNumSubslices();
+    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+    if (HalfNumThreads)
+      NumThreadsPerSubslice /= 2;
+
+    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+    if (MaxGroupSizeForced) {
+      // Set group size for the HW capacity
+      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+      uint32_t NumGroupsPerSubslice =
+          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+    } else {
+      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+             "Invalid maxGroupSize");
+      // Maximize group size while keeping a whole number of groups per
+      // subslice: shrink in KernelWidth steps until the per-group HW thread
+      // count divides the subslice thread count evenly.
+      while (MaxGroupSize >= KernelWidth) {
+        uint32_t NumThreadsPerGroup =
+            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+          uint32_t NumGroupsPerSubslice =
+              NumThreadsPerSubslice / NumThreadsPerGroup;
+          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+          break;
+        }
+        MaxGroupSize -= KernelWidth;
+      }
+    }
+  }
+
+  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  if (!MaxGroupCountForced) {
+    // Oversubscribe heuristic team counts to help hide latency.
+    GRPCounts[0] *= OptSubscRate;
+
+    size_t LoopTripcount = 0;
+    if (LoopLevels) {
+      // TODO: consider other possible LoopDesc uses
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop descriptor provided but specific ND-range is disabled\n");
+      // TODO: get rid of this constraint
+      if (LoopLevels->NumLoops > 1) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+             LoopLevels->NumLoops);
+      } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+        LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+                         LoopLevels->Levels[0].Stride) /
+                        LoopLevels->Levels[0].Stride;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+             " = %zu\n",
+             LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+             LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+             LoopTripcount);
+      }
+    }
+
+    // Do not launch more teams than a known loop trip count can feed.
+    if (LoopTripcount) {
+      const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+                                     Device.getNumSubslices() * SIMDWidth;
+      size_t AdjustedGroupCount =
+          IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+                                      MaxTotalThreads / GRPSizes[0])
+                         : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
+      AdjustedGroupCount *= OptSubscRate;
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Adjusting number of teams using the loop tripcount\n");
+      if (AdjustedGroupCount < GRPCounts[0])
+        GRPCounts[0] = AdjustedGroupCount;
+    }
+  }
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the number of total HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, and the given
+// loop(s) trip counts and group sizes.
+// Returns UINT64_MAX, if computations overflow.
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+                                     const uint32_t (&GroupSizes)[3],
+                                     uint32_t SIMDWidth) {
+  uint64_t GroupCount[3];
+  for (int I = 0; I < 3; ++I) {
+    // A zero trip count or group size makes the request meaningless (and
+    // would divide by zero); report it the same way as overflow.
+    if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+      return (std::numeric_limits<uint64_t>::max)();
+    // Ceiling division: groups needed along dimension I.
+    GroupCount[I] =
+        (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
+    // Each per-dimension group count must fit in 32 bits (ze group counts).
+    if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
+      return (std::numeric_limits<uint64_t>::max)();
+  }
+  // Fold the Y and Z counts into GroupCount[0], guarding each multiply
+  // against uint64_t overflow.
+  for (int I = 1; I < 3; ++I) {
+    if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
+      return (std::numeric_limits<uint64_t>::max)();
+    GroupCount[0] *= GroupCount[I];
+  }
+  // Multiplication of the group sizes must never overflow uint64_t
+  // for any existing device.
+  uint64_t LocalWorkSize =
+      uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+  // One HW thread runs SIMDWidth work-items: threads per WG, rounded up.
+  uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
+
+  // Check that the total number of threads fits uint64_t.
+  if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
+    return (std::numeric_limits<uint64_t>::max)();
+
+  return GroupCount[0] * ThreadsPerWG;
+}
+
+/// Compute the launch shape for a kernel driven by an explicit loop
+/// descriptor (ND-range partitioning across up to 3 loop-nest levels).
+///
+/// \param ThreadLimit     user-forced team size (0 if unspecified).
+/// \param LoopLevels      loop-nest description (bounds, strides, distribute
+///                        dimension); must be non-null on this path.
+/// \param GroupSizes      [out] WG sizes per dimension (3 entries).
+/// \param GroupCounts     [out] WG counts per dimension.
+/// \param HalfNumThreads  not consulted in this implementation.
+/// \param AllowCooperative [out] always set to false here.
+/// \return OFFLOAD_SUCCESS, or OFFLOAD_FAIL when a dimension's team count
+///         does not fit in 32 bits.
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+    L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+    uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+    bool &AllowCooperative) const {
+
+  const auto DeviceId = Device.getDeviceId();
+  const auto &Options = LevelZeroPluginTy::getOptions();
+  const auto &KernelPR = getProperties();
+  uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+  // thread_limit/ICV forces the WG size in dimension 0.
+  bool MaxGroupSizeForced = false;
+  if (ThreadLimit > 0) {
+    MaxGroupSizeForced = true;
+    MaxGroupSize = ThreadLimit;
+  }
+
+  uint32_t GRPCounts[3] = {1, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  TgtLoopDescTy *Levels = LoopLevels->Levels;
+  int32_t DistributeDim = LoopLevels->DistributeDim;
+  assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+         "Invalid distribute dimension.");
+  int32_t NumLoops = LoopLevels->NumLoops;
+  assert((NumLoops > 0 && NumLoops <= 3) &&
+         "Invalid loop nest description for ND partitioning");
+
+  // Compute global widths for X/Y/Z dimensions.
+  size_t TripCounts[3] = {1, 1, 1};
+
+  for (int32_t I = 0; I < NumLoops; I++) {
+    assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+         ", Stride = %" PRId64 "\n",
+         I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+    // Inclusive bounds: Ub < Lb means an empty loop.
+    if (Levels[I].Ub < Levels[I].Lb)
+      TripCounts[I] = 0;
+    else
+      TripCounts[I] =
+          (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+  }
+
+  // Check if any of the loop has zero iterations.
+  if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+    // Degenerate launch: 1x1x1 WGs of size 1, except possibly along the
+    // distribute dimension (see below).
+    std::fill(GroupSizes, GroupSizes + 3, 1);
+    std::fill(GRPCounts, GRPCounts + 3, 1);
+    if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+      // There is a distribute dimension, and the distribute loop
+      // has non-zero iterations, but some inner parallel loop
+      // has zero iterations. We still want to split the distribute
+      // loop's iterations between many WGs (of size 1), but the inner/lower
+      // dimensions should be 1x1.
+      // Note that this code is currently dead, because we are not
+      // hoisting the inner loops' bounds outside of the target regions.
+      // The code is here just for completeness.
+      size_t DistributeTripCount = TripCounts[DistributeDim];
+      if (DistributeTripCount > UINT32_MAX) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Invalid number of teams %zu due to large loop trip count\n",
+             DistributeTripCount);
+        return OFFLOAD_FAIL;
+      }
+      GRPCounts[DistributeDim] = DistributeTripCount;
+    }
+    AllowCooperative = false;
+    GroupCounts.groupCountX = GRPCounts[0];
+    GroupCounts.groupCountY = GRPCounts[1];
+    GroupCounts.groupCountZ = GRPCounts[2];
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (!MaxGroupSizeForced) {
+    // Use zeKernelSuggestGroupSize to compute group sizes,
+    // or fallback to setting dimension 0 width to SIMDWidth.
+    // Note that in case of user-specified LWS GRPSizes[0]
+    // is already set according to the specified value.
+    size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+    if (DistributeDim > 0) {
+      // There is a distribute dimension.
+      GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+      GlobalSizes[DistributeDim] = 1;
+    }
+
+    {
+      // Default the WG size to the kernel's preferred width.
+      if (MaxGroupSize > KernelPR.Width) {
+        GRPSizes[0] = KernelPR.Width;
+      }
+      if (DistributeDim == 0) {
+        // If there is a distribute dimension, then we do not use
+        // thin HW threads, since we do not know anything about
+        // the iteration space of the inner parallel loop regions.
+        //
+        // If there is no distribute dimension, then try to use thiner
+        // HW threads to get more independent HW threads executing
+        // the kernel - this may allow more parallelism due to
+        // the stalls being distributed across multiple HW threads rather
+        // than across SIMD lanes within one HW thread.
+        assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
+               "Unexpected team sizes for dimensions 1 or/and 2.");
+        uint32_t SimdWidth = KernelPR.SIMDWidth;
+        uint64_t TotalThreads = Device.getTotalThreads();
+        TotalThreads *= Options.ThinThreadsThreshold;
+
+        // Keep halving the WG size while the kernel still needs fewer HW
+        // threads than the (scaled) device capacity; keep the last size
+        // before the requirement crossed the threshold.
+        uint32_t GRPSizePrev = GRPSizes[0];
+        uint64_t ThreadsNeeded =
+            computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+        while (ThreadsNeeded < TotalThreads) {
+          GRPSizePrev = GRPSizes[0];
+          // Try to half the local work size (if possible) and see
+          // how many HW threads the kernel will require with this
+          // new local work size.
+          // In most implementations the initial GRPSizes[0]
+          // will be a power-of-two.
+          if (GRPSizes[0] <= 1)
+            break;
+          GRPSizes[0] >>= 1;
+          ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+        }
+        GRPSizes[0] = GRPSizePrev;
+      }
+    }
+  }
+
+  // Convert trip counts into per-dimension WG counts; dimensions below the
+  // distribute dimension get a single WG.
+  for (int32_t I = 0; I < NumLoops; I++) {
+    if (I < DistributeDim) {
+      GRPCounts[I] = 1;
+      continue;
+    }
+    size_t Trip = TripCounts[I];
+    if (GRPSizes[I] >= Trip)
+      GRPSizes[I] = Trip;
+    size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
+    if (Count > UINT32_MAX) {
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Invalid number of teams %zu due to large loop trip count\n", Count);
+      return OFFLOAD_FAIL;
+    }
+    GRPCounts[I] = (uint32_t)Count;
+  }
+  AllowCooperative = false;
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Compute the launch shape (WG sizes and counts) for this kernel on
+/// \p SubDevice, honoring num_teams/thread_limit clause values and the
+/// teams-related ICVs read from the host runtime.
+///
+/// Fix vs. previous revision: the "OMP_NUM_TEAMS(...) is ignored" message
+/// was printed in the branch that *uses* the ICV; it now fires only when a
+/// num_teams clause actually overrides OMP_NUM_TEAMS.
+///
+/// \return OFFLOAD_SUCCESS (the underlying heuristic cannot fail here).
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+                                   int32_t ThreadLimit, uint32_t *GroupSizes,
+                                   ze_group_count_t &GroupCounts,
+                                   void *LoopDesc,
+                                   bool &AllowCooperative) const {
+
+  const auto SubId = SubDevice.getDeviceId();
+  const auto &KernelPR = getProperties();
+
+  // Detect if we need to reduce available HW threads. We need this adjustment
+  // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  static std::once_flag OnceFlag;
+  static bool ZeDebugEnabled = false;
+  std::call_once(OnceFlag, []() {
+    const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
+    if (EnvVal && std::atoi(EnvVal) == 1)
+      ZeDebugEnabled = true;
+  });
+
+  // Read the most recent global thread limit and max teams.
+  auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+
+  const bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+  const bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+  const uint32_t KernelWidth = KernelPR.Width;
+  const uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+  if (ThreadLimit > 0) {
+    // use thread_limit clause value default
+    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+       ThreadLimit);
+  } else if (ThreadLimitICV > 0) {
+    // else use thread-limit-var ICV
+    ThreadLimit = ThreadLimitICV;
+    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+  }
+
+  size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+  // Set correct max group size if the kernel was compiled with explicit SIMD
+  if (SIMDWidth == 1)
+    MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+
+  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+    DP("Capping maximum team size to %zu due to kernel constraints.\n",
+       MaxThreadLimit);
+  }
+
+  if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
+    ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds current maximum %zu. Adjusted\n",
+       MaxThreadLimit);
+  }
+
+  if (NumTeams > 0) {
+    DP("Number of teams is set to %" PRId32
+       "(num_teams clause or no teams construct)\n",
+       NumTeams);
+    // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+    if (NumTeamsICV > 0)
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+           "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);
+  } else if (NumTeamsICV > 0) {
+    NumTeams = NumTeamsICV;
+    DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+       NumTeams);
+  }
+
+  decideKernelGroupArguments(SubDevice, (uint32_t)NumTeams,
+                             (uint32_t)ThreadLimit,
+                             static_cast<TgtNDRangeDescTy *>(LoopDesc),
+                             GroupSizes, GroupCounts, HalfNumThreads,
+                             /*IsTeamsNDRange=*/false);
+  AllowCooperative = false;
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Prepare and submit one team-region execution of this kernel.
+///
+/// Steps: resolve the launch shape (with per-kernel caching), bind kernel
+/// arguments, refresh indirect-access flags, then submit through either an
+/// immediate command list (optionally asynchronously) or a regular command
+/// list + queue (always synchronous).
+///
+/// \return OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+                                        KernelArgsTy &KernelArgs,
+                                        KernelLaunchParamsTy LaunchParams,
+                                        __tgt_async_info *AsyncInfo) const {
+  // Libomptarget can pass negative NumTeams and ThreadLimit now after
+  // introducing __tgt_target_kernel. This happens only when we have valid
+  // LoopDesc and the region is not a teams region.
+
+  auto zeKernel = getZeKernel();
+  auto DeviceId = l0Device.getDeviceId();
+  int32_t NumArgs = KernelArgs.NumArgs;
+  int32_t NumTeams = KernelArgs.NumTeams[0];
+  int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+  // No loop descriptor is plumbed through this entry point yet, so the
+  // shape heuristics below never see loop trip counts.
+  void *LoopDesc = nullptr;
+
+  // Normalize the "unspecified" negative values to 0.
+  if (NumTeams < 0)
+    NumTeams = 0;
+  if (ThreadLimit < 0)
+    ThreadLimit = 0;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+  auto &Plugin = l0Device.getPlugin();
+  // NOTE(review): Device is looked up from l0Device's own id, so it should
+  // alias l0Device — confirm and consider using l0Device directly.
+  auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+  auto *IdStr = Device.getZeIdCStr();
+  auto &Options = LevelZeroPluginTy::getOptions();
+  // Async execution needs a queue; fall back to synchronous if none is
+  // available.
+  bool IsAsync = AsyncInfo && Device.asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  auto *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : NULL;
+
+  // We need to get a non-const version of the Properties structure in order to
+  // use its lock and be able to cache the group params and indirect flags
+  auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
+  // Protect from kernel preparation to submission as kernels are shared.
+  std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
+
+  // Decide group sizes and counts
+  uint32_t GroupSizes[3];
+  ze_group_count_t GroupCounts;
+
+  bool AllowCooperative = false;
+
+  // Check if we can reuse previous group parameters
+  bool GroupParamsReused = KernelPR.reuseGroupParams(
+      static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
+      GroupSizes, GroupCounts, AllowCooperative);
+
+  if (!GroupParamsReused) {
+    auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+                             GroupCounts, LoopDesc, AllowCooperative);
+
+    if (RC != OFFLOAD_SUCCESS) {
+      return RC;
+    }
+
+    // Remember the shape for identical future launches.
+    KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
+                              NumTeams, ThreadLimit, GroupSizes, GroupCounts,
+                              AllowCooperative);
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+       GroupSizes[1], GroupSizes[2]);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+       GroupCounts.groupCountX, GroupCounts.groupCountY,
+       GroupCounts.groupCountZ);
+  // Bind each launch parameter as a kernel argument. A null pointer is
+  // passed as a null argument (not a pointer to a null).
+  for (int32_t I = 0; I < NumArgs; I++) {
+    {
+      void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
+      CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+                       Arg == nullptr ? nullptr : &Arg);
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+           ") was set successfully for device %s.\n",
+           I, DPxPTR(Arg), IdStr);
+    }
+  }
+
+  // Set Kernel Indirect flags; only re-issue the ze call when the
+  // allocator-derived flags changed since the last launch.
+  auto &PrevFlags = KernelPR.IndirectAccessFlags;
+  ze_kernel_indirect_access_flags_t Flags = 0;
+  Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+  Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+  if (PrevFlags != Flags) {
+    // Combine with common access flags
+    const auto FinalFlags = Device.getIndirectFlags() | Flags;
+    CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+    DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+    PrevFlags = Flags;
+  }
+
+  if (!GroupParamsReused) {
+    CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+                     GroupSizes[1], GroupSizes[2]);
+  }
+
+  ze_command_list_handle_t CmdList = nullptr;
+  ze_command_queue_handle_t CmdQueue = nullptr;
+  const bool UseImmCmdList = Device.useImmForCompute();
+
+  if (UseImmCmdList) {
+    CmdList = Device.getImmCmdList();
+    // Command queue is not used with immediate command list
+  } else {
+    CmdList = Device.getCmdList();
+    CmdQueue = Device.getCmdQueue();
+  }
+
+  if (UseImmCmdList) {
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Using immediate command list for kernel submission.\n");
+    auto Event = Device.getEvent();
+    size_t NumWaitEvents = 0;
+    ze_event_handle_t *WaitEvents = nullptr;
+    // In ordered mode only the most recent event needs to be waited on;
+    // otherwise the kernel depends on all outstanding copy events.
+    if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
+      if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
+        NumWaitEvents = 1;
+        WaitEvents = &AsyncQueue->WaitEvents.back();
+      } else {
+        NumWaitEvents = AsyncQueue->WaitEvents.size();
+        WaitEvents = AsyncQueue->WaitEvents.data();
+      }
+    }
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+    if (AllowCooperative)
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                       zeKernel, &GroupCounts, Event, NumWaitEvents,
+                       WaitEvents);
+    else
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                       &GroupCounts, Event, NumWaitEvents, WaitEvents);
+    // Submission done; other threads may now prepare this kernel.
+    KernelLock.unlock();
+    if (IsAsync) {
+      // Defer completion: record the event for later synchronization.
+      AsyncQueue->WaitEvents.push_back(Event);
+      AsyncQueue->KernelEvent = Event;
+    } else {
+      CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+      Device.releaseEvent(Event);
+    }
+  } else {
+    // Regular command list path: always synchronous.
+    // NOTE(review): Event is never assigned on this path, so the release
+    // branch below is dead code — confirm whether an event was intended.
+    ze_event_handle_t Event = nullptr;
+    KernelLock.unlock();
+    CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+    CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+                         CmdQueue, 1, &CmdList, nullptr);
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+         "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+    CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+    CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+    if (Event) {
+      Device.releaseEvent(Event);
+    }
+  }
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+       IdStr);
+
+  return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
new file mode 100644
index 0000000000000..790acdd9f568f
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -0,0 +1,637 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Memory.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Allocate one chunk from this block, or return nullptr if every slot is
+/// taken. A one-entry cache (FreeSlot), set by the last dealloc(), is tried
+/// first to avoid scanning the occupancy bitmap.
+void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
+  if (isFull())
+    return nullptr;
+  // Fast path: reuse the most recently freed slot if one is cached.
+  if (FreeSlot != UINT32_MAX) {
+    const uint32_t Slot = FreeSlot;
+    FreeSlot = UINT32_MAX;
+    UsedSlots[Slot] = true;
+    NumUsedSlots++;
+    return reinterpret_cast<void *>(Base + Slot * ChunkSize);
+  }
+  // Slow path: linear scan for the first unused slot.
+  for (uint32_t I = 0; I < NumSlots; I++) {
+    if (UsedSlots[I])
+      continue;
+    UsedSlots[I] = true;
+    NumUsedSlots++;
+    return reinterpret_cast<void *>(Base + I * ChunkSize);
+  }
+  // isFull() reported a free slot, so the scan above must have found one.
+  // Should not reach here.
+  assert(0 && "Inconsistent memory pool state");
+  return nullptr;
+}
+
+/// Deallocate the given memory chunk: clear its occupancy bit and cache the
+/// slot in FreeSlot so the next alloc() can reuse it without scanning.
+/// \param Mem pointer previously returned by alloc() from this block.
+void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
+  // The previous `if (!contains(Mem)) assert(0 && ...)` form was a no-op
+  // under NDEBUG and then fell through to compute an out-of-range slot;
+  // assert the precondition directly instead.
+  assert(contains(Mem) && "Inconsistent memory pool state");
+  const uint32_t Slot = (reinterpret_cast<uintptr_t>(Mem) - Base) / ChunkSize;
+  UsedSlots[Slot] = false;
+  NumUsedSlots--;
+  FreeSlot = Slot;
+}
+
+/// Construct a memory pool for allocation kind \p Kind (device/host/shared)
+/// owned by \p _Allocator, sized from the user options. A small probe
+/// allocation discovers the page size, which becomes the minimum block
+/// allocation unit; per-bucket chunk/block sizes are then precomputed.
+MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = Kind;
+  Allocator = _Allocator;
+
+  // Read user-defined options: {AllocMax(MB), Capacity, PoolSize(MB)}.
+  const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
+  const size_t UserAllocMax = UserOptions[0];
+  const size_t UserCapacity = UserOptions[1];
+  const size_t UserPoolSize = UserOptions[2];
+
+  BlockCapacity = UserCapacity;
+  PoolSizeMax = UserPoolSize << 20; // MB to B
+  PoolSize = 0;
+
+  auto Context = Allocator->L0Context->getZeContext();
+  const auto Device = Allocator->Device;
+
+  // Check page size used for this allocation kind to decide minimum
+  // allocation size when allocating from L0: allocate a tiny probe, query
+  // its properties, and free it again.
+  void *Mem = Allocator->allocL0(8, 0, AllocKind);
+  ze_memory_allocation_properties_t AP{
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
+      ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
+  CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+  AllocUnit = (std::max)(AP.pageSize, AllocUnit);
+  CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+
+  bool IsDiscrete = false;
+  if (Device) {
+    ze_device_properties_t Properties{};
+    Properties.deviceId = 0;
+    Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    Properties.pNext = nullptr;
+    CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+    IsDiscrete = Device->isDiscreteDevice();
+
+    if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
+      // Use page size as minimum chunk size for USM shared on discrete
+      // device.
+      // FIXME: pageSize is not returned correctly (=0) on some new devices,
+      //        so use fallback value for now.
+      AllocMin = (std::max)(AP.pageSize, AllocUnit);
+      AllocUnit = AllocMin * BlockCapacity;
+    }
+  }
+
+  // Convert MB to B and round up to power of 2
+  AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
+  if (AllocMin >= AllocMax) {
+    AllocMax = 2 * AllocMin;
+    DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
+       "requirements.\n",
+       AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+  }
+  assert(AllocMin < AllocMax &&
+         "Invalid parameters while initializing memory pool");
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+
+  // Set bucket parameters: bucket I serves chunks of (AllocMin << I) bytes.
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const size_t ChunkSize = AllocMin << I;
+    size_t BlockSize = ChunkSize * BlockCapacity;
+    // On discrete device, the cost of native L0 invocation doubles when
+    // the requested size doubles after certain threshold, so allocating
+    // larger block does not pay off at all. It is better to keep a single
+    // chunk in a single block in such cases.
+    if (BlockSize <= AllocUnit) {
+      BlockSize = AllocUnit; // Allocation unit is already large enough
+    } else if (IsDiscrete) {
+      // Do not preallocate if it does not pay off
+      if (ChunkSize >= L0UsmPreAllocThreshold ||
+          (AllocKind == TARGET_ALLOC_HOST &&
+           ChunkSize >= L0HostUsmPreAllocThreshold))
+        BlockSize = ChunkSize;
+    }
+    BucketParams.emplace_back(ChunkSize, BlockSize);
+  }
+
+  DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
+     "AllocMax = %zu, "
+     "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
+     ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+     BlockCapacity, PoolSizeMax);
+}
+
+// Used for reduction scratch pool. Fixed 64KB minimum chunk size; the max
+// chunk size, block capacity and pool limit come from
+// Option.ReductionPoolInfo = {AllocMax(MB), Capacity, PoolSizeMax(MB)}.
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
+                                     const L0OptionsTy &Option) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = _Allocator;
+  AllocMin = AllocUnit = 1024 << 6; // 64KB
+  AllocMax = Option.ReductionPoolInfo[0] << 20; // MB to B
+  BlockCapacity = Option.ReductionPoolInfo[1];
+  PoolSize = 0;
+  PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20; // MB to B
+
+  // One bucket per power-of-two chunk size between AllocMin and AllocMax.
+  const auto MinSize = getBucketId(AllocMin);
+  const auto MaxSize = getBucketId(AllocMax);
+  Buckets.resize(MaxSize - MinSize + 1);
+  BucketStats.resize(Buckets.size(), {0, 0});
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const size_t ChunkSize = AllocMin << I;
+    BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
+  }
+
+  DP("Initialized reduction scratch pool for device " DPxMOD
+     ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+// Used for small memory pool with fixed parameters: a single bucket of
+// AllocMin-sized, zero-initialized device chunks (reduction counters).
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
+  AllocKind = TARGET_ALLOC_DEVICE;
+  Allocator = _Allocator;
+  AllocMax = AllocMin; // single chunk size; AllocMin keeps its default value
+  BlockCapacity = AllocUnit / AllocMax;
+  PoolSize = 0;
+  PoolSizeMax = (1 << 20); // this should be sufficiently large
+  Buckets.resize(1);
+  BucketStats.resize(1, {0, 0});
+  BucketParams.emplace_back(AllocMax, AllocUnit);
+  // New blocks from this pool are filled from ZeroInitValue (see alloc()).
+  ZeroInit = true;
+  ZeroInitValue.resize(AllocUnit, 0);
+  DP("Initialized zero-initialized reduction counter pool for "
+     "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+/// Dump per-bucket allocation statistics for this pool (debug output).
+void MemAllocatorTy::MemPoolTy::printUsage() {
+  // Emit a count right-aligned in an 11-char field, switching to
+  // scientific notation for very large values.
+  auto EmitCount = [](uint64_t Value) {
+    if (Value > 1e9)
+      fprintf(stderr, "%11.2e", float(Value));
+    else
+      fprintf(stderr, "%11" PRIu64, Value);
+  };
+
+  // The pool was used iff any bucket recorded a new allocation or a reuse.
+  bool Used = false;
+  for (size_t I = 0; I < BucketStats.size() && !Used; I++)
+    Used = BucketStats[I].first > 0 || BucketStats[I].second > 0;
+
+  DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+     DPxPTR(Allocator->Device));
+
+  if (!Used) {
+    DP("-- Not used\n");
+    return;
+  }
+
+  DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n",
+     AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20);
+  DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)");
+  for (size_t I = 0; I < Buckets.size(); I++) {
+    const auto &Stat = BucketStats[I];
+    if (Stat.first == 0 && Stat.second == 0)
+      continue;
+    DP("-- Bucket[%10zu]:", BucketParams[I].first);
+    EmitCount(Stat.first);
+    EmitCount(Stat.second);
+    fprintf(stderr, "%11.2f\n",
+            float(Stat.second) / float(Stat.first + Stat.second) * 100);
+  }
+}
+
+/// Release resources used in the pool: print usage statistics in debug
+/// mode, then free every block of every bucket back to Level Zero,
+/// accounting each release via Allocator->log().
+MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+  const int DebugLevel = getDebugLevel();
+  if (DebugLevel > 0)
+    printUsage();
+  for (auto &Bucket : Buckets) {
+    for (auto *Block : Bucket) {
+      // Account the freed block before releasing its storage.
+      if (DebugLevel > 0)
+        Allocator->log(0, Block->Size, AllocKind);
+      CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
+                       reinterpret_cast<void *>(Block->Base));
+      delete Block;
+    }
+  }
+}
+
+/// Allocate the requested size of memory from this pool.
+/// AllocSize is set to the chunk size internally used for the returned
+/// memory (the bucket's chunk size). Returns nullptr when the request is
+/// zero/oversized or when the pool is not allowed to grow any further.
+void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+  if (Size == 0 || Size > AllocMax)
+    return nullptr;
+
+  const uint32_t BucketId = getBucketId(Size);
+  auto &Blocks = Buckets[BucketId];
+  void *Mem = nullptr;
+
+  // First try to reuse a free chunk from an existing block of this bucket.
+  for (auto *Block : Blocks) {
+    if (Block->isFull())
+      continue;
+    Mem = Block->alloc();
+    assert(Mem && "Inconsistent state while allocating memory from pool");
+    PtrToBlock.emplace(Mem, Block);
+    break;
+  }
+
+  if (Mem == nullptr) {
+    // No free chunk available: decide whether the pool may grow. Small
+    // requests can still be served from a separate small-allocation budget
+    // after the main budget (PoolSizeMax) is exhausted.
+    const bool IsSmallAllocatable =
+        (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
+    const bool IsFull = (PoolSize > PoolSizeMax);
+    if (IsFull && !IsSmallAllocatable)
+      return nullptr;
+    // Bucket is empty or all blocks in the bucket are full
+    const auto ChunkSize = BucketParams[BucketId].first;
+    const auto BlockSize = BucketParams[BucketId].second;
+    void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+
+    if (ZeroInit) {
+      // Zero-filled pools (reduction counters) copy a prepared zero buffer
+      // into the new block up front.
+      auto RC =
+          Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
+      if (RC != OFFLOAD_SUCCESS) {
+        // NOTE(review): Base is not returned to L0 on this error path --
+        // looks like a leak of the freshly allocated block; confirm.
+        DP("Failed to zero-initialize pool memory\n");
+        return nullptr;
+      }
+    }
+
+    BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
+    Blocks.push_back(Block);
+    Mem = Block->alloc();
+    PtrToBlock.emplace(Mem, Block);
+    // Charge the new block to the small budget iff the main pool was full.
+    if (IsFull)
+      SmallPoolSize += BlockSize;
+    else
+      PoolSize += BlockSize;
+    DP("New block allocation for %s pool: base = " DPxMOD
+       ", size = %zu, pool size = %zu\n",
+       ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
+    BucketStats[BucketId].first++; // new block allocated
+  } else {
+    BucketStats[BucketId].second++; // reused a pooled chunk
+  }
+
+  // Report the chunk size actually backing the returned pointer.
+  AllocSize = (AllocMin << BucketId);
+
+  return Mem;
+}
+
+/// Deallocate the specified memory and return the chunk size deallocated,
+/// or 0 if \p Ptr does not belong to this pool.
+size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
+  // Single map lookup instead of count() + two operator[] accesses.
+  const auto It = PtrToBlock.find(Ptr);
+  if (It == PtrToBlock.end())
+    return 0;
+  auto *Block = It->second;
+  Block->dealloc(Ptr);
+  const size_t Deallocated = Block->ChunkSize;
+  PtrToBlock.erase(It);
+  return Deallocated;
+}
+
+/// Record allocation info for user pointer \p Ptr with base address
+/// \p Base, size, kind, pool membership and implicit-arg flag. In debug
+/// builds, verifies that the new range does not overlap its neighbors, so
+/// the map keeps disjoint memory ranges (relies on key-ordered iteration).
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
+                                            int32_t Kind, bool InPool,
+                                            bool ImplicitArg) {
+  const auto Inserted =
+      Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg});
+  // Check if we keep valid disjoint memory ranges.
+  [[maybe_unused]] bool Valid = Inserted.second;
+  if (Valid) {
+    // Predecessor (if any) must end at or before Ptr.
+    if (Inserted.first != Map.begin()) {
+      const auto I = std::prev(Inserted.first, 1);
+      Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr;
+    }
+    // Successor (if any) must start at or after Ptr + Size.
+    if (Valid) {
+      const auto I = std::next(Inserted.first, 1);
+      if (I != Map.end())
+        Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first;
+    }
+  }
+  assert(Valid && "Invalid overlapping memory allocation");
+  if (ImplicitArg)
+    NumImplicitArgs[Kind]++;
+}
+
+/// Remove the allocation record for \p Ptr, if any. When \p Removed is
+/// non-null the erased record is copied into it. Keeps the per-kind
+/// implicit-argument counters in sync. Returns false if \p Ptr is unknown.
+bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
+                                               MemAllocInfoTy *Removed) {
+  const auto It = Map.find(Ptr);
+  if (It == Map.end())
+    return false;
+  const MemAllocInfoTy &Info = It->second;
+  if (Info.ImplicitArg)
+    NumImplicitArgs[Info.Kind]--;
+  if (Removed)
+    *Removed = Info;
+  Map.erase(It);
+  return true;
+}
+
+/// Initialize the per-device memory pools (device and shared kinds, when
+/// enabled in \p Option) plus the dedicated reduction-scratch and
+/// zero-initialized counter pools, and record the device's allocation
+/// limit.
+void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
+                                     const L0OptionsTy &Option) {
+  SupportsLargeMem = L0Device.supportsLargeMem();
+  IsHostMem = false;
+  Device = &L0Device;
+  L0Context = &L0Device.getL0Context();
+  for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
+    if (Option.MemPoolInfo.count(Kind) > 0) {
+      std::lock_guard<std::mutex> Lock(Mtx);
+      // Construct the pool in place inside the map.
+      Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+                    std::forward_as_tuple(Kind, this, Option));
+    }
+    // Statistics are only tracked in debug mode.
+    if (getDebugLevel() > 0)
+      Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+                    std::tuple<>{});
+  }
+  ReductionPool = std::make_unique<MemPoolTy>(this, Option);
+  CounterPool = std::make_unique<MemPoolTy>(this);
+  updateMaxAllocSize(L0Device);
+}
+
+/// Initialize the host (driver-wide) memory pool for this allocator. No
+/// device is associated, so Device stays unset; MaxAllocSize is refined
+/// later per device via updateMaxAllocSize().
+void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
+                                  const L0OptionsTy &Option) {
+  SupportsLargeMem = Driver.supportsLargeMem();
+  IsHostMem = true;
+  this->L0Context = &Driver;
+  if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    // Construct the host pool in place inside the map.
+    Pools.emplace(std::piecewise_construct,
+                  std::forward_as_tuple(TARGET_ALLOC_HOST),
+                  std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
+  }
+  // Statistics are only tracked in debug mode.
+  if (getDebugLevel() > 0)
+    Stats.emplace(std::piecewise_construct,
+                  std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{});
+}
+
+/// Refresh MaxAllocSize from the given device's properties. Device
+/// allocators simply adopt the device's limit; host allocators (shared
+/// across all devices of a driver) keep the minimum over all devices seen.
+void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
+  ze_device_properties_t Properties{};
+  Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+  Properties.pNext = nullptr;
+  CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &Properties);
+
+  if (!IsHostMem) {
+    MaxAllocSize = Properties.maxMemAllocSize;
+    DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
+       MaxAllocSize);
+    return;
+  }
+
+  // Host memory must be usable from every device of the driver, so only
+  // shrink the limit, never grow it.
+  if (MaxAllocSize > Properties.maxMemAllocSize) {
+    MaxAllocSize = Properties.maxMemAllocSize;
+    DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
+       DPxPTR(L0Context), MaxAllocSize);
+  }
+}
+
+/// Release resources and report statistics if requested.
+/// Frees all RTL-owned allocations first (they may live in the pools),
+/// then destroys the pools, dumps usage statistics in debug mode, and
+/// finally marks the allocator as deinitialized by clearing L0Context.
+void MemAllocatorTy::deinit() {
+  std::lock_guard<std::mutex> Lock(Mtx);
+  // Release RTL-owned memory
+  for (auto *M : MemOwned)
+    dealloc_locked(M);
+  // Release resources used in the pool
+  Pools.clear();
+  ReductionPool.reset(nullptr);
+  CounterPool.reset(nullptr);
+  // Report memory usage if requested
+  if (getDebugLevel() > 0) {
+    for (auto &Stat : Stats) {
+      DP("Memory usage for %s, device " DPxMOD "\n",
+         ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device));
+      const auto &ST = Stat.second;
+      // Index 0 = native (direct L0) allocations, index 1 = pooled ones.
+      if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
+        DP("-- Not used\n");
+        continue;
+      }
+      DP("-- Allocator: %12s, %12s\n", "Native", "Pool");
+      DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]);
+      DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]);
+      DP("-- Freed    : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]);
+      DP("-- InUse    : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]);
+      DP("-- PeakUse  : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]);
+      DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]);
+    }
+  }
+
+  // mark as deinitialized
+  L0Context = nullptr;
+}
+
+/// Allocate memory with the specified information.
+/// \param Size      bytes requested by the caller.
+/// \param Align     requested alignment (0 = none); not expected to be
+///                  meaningful together with a non-zero \p Offset.
+/// \param Kind      TARGET_ALLOC_{DEVICE,HOST,SHARED}.
+/// \param Offset    leading bytes; the returned pointer is Base + Offset.
+/// \param UserAlloc recorded as the implicit-arg flag in AllocInfo.
+/// \param DevMalloc when true the RTL owns the memory (freed in deinit()).
+/// \param MemAdvice pooling is bypassed unless this is UINT32_MAX.
+/// \param AllocOpt  selects the dedicated reduction scratch/counter pools.
+/// Tries the matching pool first, falling back to a direct L0 allocation.
+void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
+                            intptr_t Offset, bool UserAlloc, bool DevMalloc,
+                            uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+  assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
+          Kind == TARGET_ALLOC_SHARED) &&
+         "Unknown memory kind while allocating target memory");
+
+  std::lock_guard<std::mutex> Lock(Mtx);
+
+  // We do not expect meaningful Align parameter when Offset > 0, so the
+  // following code does not handle such case.
+
+  size_t AllocSize = Size + Offset;
+  void *Mem = nullptr;
+  void *AllocBase = nullptr;
+  const bool UseScratchPool =
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH);
+  const bool UseZeroInitPool =
+      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+  const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
+
+  if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+    // Pool is enabled for the allocation kind, and we do not use any memory
+    // advice. We should avoid using pool if there is any meaningful memory
+    // advice not to affect sibling allocation in the same block.
+    // Over-allocate so an aligned pointer can be carved out of the chunk.
+    if (Align > 0)
+      AllocSize += (Align - 1);
+    size_t PoolAllocSize = 0;
+    if (UseScratchPool)
+      AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize);
+    else if (UseZeroInitPool)
+      AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
+    else
+      AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize);
+    if (AllocBase) {
+      // Round up to the requested alignment, then apply the offset.
+      uintptr_t Base = (uintptr_t)AllocBase;
+      if (Align > 0)
+        Base = (Base + Align) & ~(Align - 1);
+      Mem = (void *)(Base + Offset);
+      AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc);
+      log(Size, PoolAllocSize, Kind, true /* Pool */);
+      if (DevMalloc)
+        MemOwned.push_back(AllocBase);
+      if (UseDedicatedPool) {
+        DP("Allocated %zu bytes from %s pool\n", Size,
+           UseScratchPool ? "scratch" : "zero-initialized");
+      }
+      return Mem;
+    }
+  }
+
+  // Pool disabled, unsuitable, or exhausted: allocate directly from L0.
+  AllocBase = allocL0(AllocSize, Align, Kind, Size);
+  if (AllocBase) {
+    Mem = (void *)((uintptr_t)AllocBase + Offset);
+    AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
+    if (DevMalloc)
+      MemOwned.push_back(AllocBase);
+    if (UseDedicatedPool) {
+      // We do not want this happen in general.
+      DP("Allocated %zu bytes from L0 for %s pool\n", Size,
+         UseScratchPool ? "scratch" : "zero-initialized");
+    }
+  }
+  return Mem;
+}
+
+/// Deallocate memory previously returned by alloc(). Caller must hold Mtx.
+/// Pool-backed memory is returned to its pool (regular pool for the kind,
+/// then reduction scratch, then counter pool); native allocations go back
+/// to L0 via zeMemFree. Returns OFFLOAD_FAIL for unknown pointers.
+int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
+  MemAllocInfoTy Info;
+  if (!AllocInfo.remove(Ptr, &Info)) {
+    DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
+       DPxPTR(Ptr));
+    return OFFLOAD_FAIL;
+  }
+  if (Info.InPool) {
+    size_t DeallocSize = 0;
+    if (Pools.count(Info.Kind) > 0)
+      DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base);
+    if (DeallocSize == 0) {
+      // Try reduction scratch pool
+      DeallocSize = ReductionPool->dealloc(Info.Base);
+      // Try reduction counter pool
+      if (DeallocSize == 0)
+        DeallocSize = CounterPool->dealloc(Info.Base);
+      if (DeallocSize == 0) {
+        DP("Error: Cannot return memory " DPxMOD " to pool\n", DPxPTR(Ptr));
+        return OFFLOAD_FAIL;
+      }
+    }
+    log(0, DeallocSize, Info.Kind, true /* Pool */);
+    return OFFLOAD_SUCCESS;
+  }
+  if (!Info.Base) {
+    DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr));
+    return OFFLOAD_FAIL;
+  }
+  // Native allocation: free the base pointer (Ptr may be offset into it).
+  CALL_ZE_RET_FAIL(zeMemFree, L0Context->getZeContext(), Info.Base);
+  log(0, Info.Size, Info.Kind);
+
+  DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
+     DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Forward a memory copy to the owning device's enqueueMemCopy().
+/// Requires Device to be set, i.e. device allocators only.
+int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
+                                       size_t Size) {
+  return Device->enqueueMemCopy(Dst, Src, Size);
+}
+
+/// Allocate memory directly from Level Zero, bypassing the pools.
+/// \param ActiveSize when non-zero, the size recorded in the statistics
+///        instead of \p Size (e.g. when a pool block is larger than the
+///        user request). Device allocations are made resident before
+///        returning; nullptr is returned on any failure.
+void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
+                              size_t ActiveSize) {
+  void *Mem = nullptr;
+  ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+                                        nullptr, 0, 0};
+  ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+                                    nullptr, 0};
+
+  // Use relaxed allocation limit if driver supports
+  ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{
+      ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr,
+      ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE};
+  if (Size > MaxAllocSize && SupportsLargeMem) {
+    // Chain the relaxed-limits descriptor into whichever desc is used.
+    DeviceDesc.pNext = &RelaxedDesc;
+    HostDesc.pNext = &RelaxedDesc;
+  }
+
+  // Host pools have no device; zeDevice stays null in that case.
+  auto zeDevice = Device ? Device->getZeDevice() : 0;
+  auto zeContext = L0Context->getZeContext();
+  bool makeResident = false;
+  switch (Kind) {
+  case TARGET_ALLOC_DEVICE:
+    makeResident = true;
+    CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
+                     zeDevice, &Mem);
+    DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  case TARGET_ALLOC_HOST:
+    CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
+    DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  case TARGET_ALLOC_SHARED:
+    CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
+                     Align, zeDevice, &Mem);
+    DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
+    break;
+  default:
+    assert(0 && "Invalid target data allocation kind");
+  }
+
+  size_t LoggedSize = ActiveSize ? ActiveSize : Size;
+  log(LoggedSize, LoggedSize, Kind);
+  if (makeResident) {
+    assert(Device &&
+           "Device is not set for memory allocation. Is this a Device Pool?");
+    if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS)
+      Mem = nullptr;
+  }
+  return Mem;
+}
+
+/// Hand out an idle event, growing the backing L0 event pool on demand.
+/// Returns nullptr if pool/event creation fails.
+ze_event_handle_t EventPoolTy::getEvent() {
+  std::lock_guard<std::mutex> Lock(*Mtx);
+
+  if (Events.empty()) {
+    // All events are in use: create a fresh L0 pool of PoolSize events.
+    ze_event_pool_desc_t PoolDesc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
+                                  0, 0};
+    PoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
+    PoolDesc.count = PoolSize;
+    ze_event_pool_handle_t NewPool;
+    CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &PoolDesc, 0, nullptr,
+                     &NewPool);
+    Pools.push_back(NewPool);
+
+    // Populate the free list with one event per pool slot.
+    ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
+    EventDesc.wait = 0;
+    EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    for (uint32_t Idx = 0; Idx < PoolSize; Idx++) {
+      EventDesc.index = Idx;
+      ze_event_handle_t NewEvent;
+      CALL_ZE_RET_NULL(zeEventCreate, NewPool, &EventDesc, &NewEvent);
+      Events.push_back(NewEvent);
+    }
+  }
+
+  ze_event_handle_t Result = Events.back();
+  Events.pop_back();
+  return Result;
+}
+
+/// Return an event to the pool.
+/// The event is host-reset before going back on the free list so it can be
+/// handed out again by getEvent(). \p Device is currently unused here.
+void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
+  std::lock_guard<std::mutex> Lock(*Mtx);
+  CALL_ZE_RET_VOID(zeEventHostReset, Event);
+  Events.push_back(Event);
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
new file mode 100644
index 0000000000000..3acb2e78927e7
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -0,0 +1,371 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget.h"
+
+#include "L0Defs.h"
+#include "L0Options.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR?
+/// Each entry of ExplicitRootDevices is (Accept, Root, Sub, CCS) where an
+/// ID of -2 is a wildcard (match any present level) and -1 means the level
+/// is absent on the queried device. Returns true when the first matching
+/// entry is accepting; false for a discard match or no match at all.
+bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
+                                  int32_t CCSID) const {
+  if (ExplicitRootDevices.empty())
+    return false;
+  for (const auto &RootDev : ExplicitRootDevices) {
+    const auto ErootID = std::get<1>(RootDev);
+    // Root must match exactly unless the entry uses the -2 wildcard.
+    if (ErootID != -2 && RootID != ErootID)
+      continue;
+    const auto EsubID = std::get<2>(RootDev);
+    // Sub-device matches on exact ID, or on wildcard (-2) when the device
+    // actually has a sub-device (SubID != -1).
+    if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
+      continue;
+    const auto ECCSID = std::get<3>(RootDev);
+    // Same matching rule for the CCS level.
+    if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
+      continue;
+    // Check if isDiscard
+    if (!std::get<0>(RootDev))
+      return false;
+    return true;
+  }
+  return false;
+}
+
+/// Read environment variables
+void L0OptionsTy::processEnvironmentVars() {
+  // Compilation options for IGC
+  UserCompilationOptions +=
+      std::string(" ") +
+      StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
+
+  // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
+  const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
+  if (DeviceSelectorVar.isPresent()) {
+    std::string EnvStr(std::move(DeviceSelectorVar.get()));
+    uint32_t numDiscard = 0;
+    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
+                   [](unsigned char C) { return std::tolower(C); });
+
+    std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+    for (const auto &Term : Entries) {
+      bool isDiscard = false;
+      std::vector<std::string_view> Pair = tokenize(Term, ":", true);
+      if (Pair.empty()) {
+        FAILURE_MESSAGE(
+            "Incomplete selector! Pair and device must be specified.\n");
+      } else if (Pair.size() == 1) {
+        FAILURE_MESSAGE("Incomplete selector!  Try '%s:*'if all devices "
+                        "under the Pair was original intention.\n",
+                        Pair[0].data());
+      } else if (Pair.size() > 2) {
+        FAILURE_MESSAGE(
+            "Error parsing selector string \"%s\" Too many colons (:)\n",
+            Term.data());
+      }
+      if (!((Pair[0][0] == '*') ||
+            (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
+            (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+        break;
+      isDiscard = Pair[0][0] == '!';
+      if (isDiscard)
+        numDiscard++;
+      else if (numDiscard > 0)
+        FAILURE_MESSAGE("All negative(discarding) filters must appear after "
+                        "all positive(accepting) filters!");
+
+      std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+      for (const auto &TargetStr : Targets) {
+        bool HasDeviceWildCard = false;
+        bool HasSubDeviceWildCard = false;
+        bool DeviceNum = false;
+        std::vector<std::string_view> DeviceSubTuple =
+            tokenize(TargetStr, ".", true);
+        int32_t RootD[3] = {-1, -1, -1};
+        if (DeviceSubTuple.empty()) {
+          FAILURE_MESSAGE(
+              "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
+              "specified.");
+        }
+
+        std::string_view TopDeviceStr = DeviceSubTuple[0];
+        static const std::array<std::string, 7> DeviceStr = {
+            "host", "cpu", "gpu", "acc", "fpga", "*"};
+        auto It =
+            find_if(DeviceStr.begin(), DeviceStr.end(),
+                    [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
+        if (It != DeviceStr.end()) {
+          if (TopDeviceStr[0] == '*') {
+            HasDeviceWildCard = true;
+            RootD[0] = -2;
+          } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+            continue;
+        } else {
+          std::string TDS(TopDeviceStr);
+          if (!isDigits(TDS)) {
+            FAILURE_MESSAGE("error parsing device number: %s",
+                            DeviceSubTuple[0].data());
+          } else {
+            RootD[0] = std::stoi(TDS);
+            DeviceNum = true;
+          }
+        }
+        if (DeviceSubTuple.size() >= 2) {
+          if (!DeviceNum && !HasDeviceWildCard)
+            FAILURE_MESSAGE("sub-devices can only be requested when parent "
+                            "device is specified by number or wildcard, not a "
+                            "device type like \'gpu\'");
+          std::string_view SubDeviceStr = DeviceSubTuple[1];
+          if (SubDeviceStr[0] == '*') {
+            HasSubDeviceWildCard = true;
+            RootD[1] = -2;
+          } else {
+            if (HasDeviceWildCard) // subdevice is a number and device is a *
+              FAILURE_MESSAGE(
+                  "sub-device can't be requested by number if parent "
+                  "device is specified by a wildcard.");
+
+            std::string SDS(SubDeviceStr);
+            if (!isDigits(SDS)) {
+              FAILURE_MESSAGE("error parsing subdevice index: %s",
+                              DeviceSubTuple[1].data());
+            } else
+              RootD[1] = std::stoi(SDS);
+          }
+        }
+        if (DeviceSubTuple.size() == 3) {
+          std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+          if (SubSubDeviceStr[0] == '*') {
+            RootD[2] = -2;
+          } else {
+            if (HasSubDeviceWildCard)
+              FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
+                              "sub-device before is specified by a wildcard.");
+            std::string SSDS(SubSubDeviceStr);
+            if (!isDigits(SSDS)) {
+              FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
+                              DeviceSubTuple[2].data());
+            } else
+              RootD[2] = std::stoi(SSDS);
+          }
+        } else if (DeviceSubTuple.size() > 3) {
+          FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
+                          "supported at this time ",
+                          TargetStr.data());
+        }
+        if (isDiscard)
+          ExplicitRootDevices.insert(
+              ExplicitRootDevices.begin(),
+              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+                                                          RootD[1], RootD[2]));
+        else
+          ExplicitRootDevices.push_back(
+              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+                                                          RootD[1], RootD[2]));
+      }
+    }
+  }
+
+  DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
+     ExplicitRootDevices.size());
+  DP("  (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
+     "-1(ignore)\n");
+  for (auto &T : ExplicitRootDevices) {
+    DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
+       std::get<2>(T), std::get<3>(T));
+    (void)T; // silence warning
+  }
+
+  // Memory pool
+  // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
+  //  <Option>       := 0 | <PoolInfoList>
+  //  <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]
+  //  <PoolInfo>     := <MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]
+  //  <MemType>      := all | device | host | shared
+  //  <AllocMax>     := non-negative integer or empty, max allocation size in
+  //                    MB (default: 1)
+  //  <Capacity>     := positive integer or empty, number of allocations from
+  //                    a single block (default: 4)
+  //  <PoolSize>     := positive integer or empty, max pool size in MB
+  //                    (default: 256)
+  const StringEnvar MemoryPoolVar("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL", "");
+  if (MemoryPoolVar.isPresent()) {
+    if (MemoryPoolVar.get() == "0") {
+      Flags.UseMemoryPool = 0;
+      MemPoolInfo.clear();
+    } else {
+      std::istringstream Str(MemoryPoolVar.get());
+      int32_t MemType = -1;
+      int32_t Offset = 0;
+      int32_t Valid = 1;
+      const std::array<int32_t, 3> DefaultValue{1, 4, 256};
+      const int32_t AllMemType = INT32_MAX;
+      std::array<int32_t, 3> AllInfo{1, 4, 256};
+      std::map<int32_t, std::array<int32_t, 3>> PoolInfo;
+      for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
+        if (Token == "device") {
+          MemType = TARGET_ALLOC_DEVICE;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "host") {
+          MemType = TARGET_ALLOC_HOST;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "shared") {
+          MemType = TARGET_ALLOC_SHARED;
+          PoolInfo.emplace(MemType, DefaultValue);
+          Offset = 0;
+        } else if (Token == "all") {
+          MemType = AllMemType;
+          Offset = 0;
+          Valid = 2;
+        } else if (Offset < 3 && MemType >= 0) {
+          int32_t Num = std::atoi(Token.c_str());
+          bool ValidNum = (Num >= 0 && Offset == 0) || (Num > 0 && Offset > 0);
+          if (ValidNum && MemType == AllMemType)
+            AllInfo[Offset++] = Num;
+          else if (ValidNum)
+            PoolInfo[MemType][Offset++] = Num;
+          else if (Token.size() == 0)
+            Offset++;
+          else
+            Valid = 0;
+        } else {
+          Valid = 0;
+        }
+      }
+      if (Valid > 0) {
+        if (Valid == 2) {
+          // "all" is specified -- ignore other inputs
+          if (AllInfo[0] > 0) {
+            MemPoolInfo[TARGET_ALLOC_DEVICE] = AllInfo;
+            MemPoolInfo[TARGET_ALLOC_HOST] = AllInfo;
+            MemPoolInfo[TARGET_ALLOC_SHARED] = std::move(AllInfo);
+          } else {
+            MemPoolInfo.clear();
+          }
+        } else {
+          // Use user-specified configuration
+          for (auto &I : PoolInfo) {
+            if (I.second[0] > 0)
+              MemPoolInfo[I.first] = I.second;
+            else
+              MemPoolInfo.erase(I.first);
+          }
+        }
+      } else {
+        DP("Ignoring incorrect memory pool configuration "
+           "LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=%s\n",
+           MemoryPoolVar.get().c_str());
+        DP("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>\n");
+        DP("  <Option>       := 0 | <PoolInfoList>\n");
+        DP("  <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]\n");
+        DP("  <PoolInfo>     := "
+           "<MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]\n");
+        DP("  <MemType>      := all | device | host | shared\n");
+        DP("  <AllocMax>     := non-negative integer or empty, "
+           "max allocation size in MB (default: 1)\n");
+        DP("  <Capacity>     := positive integer or empty, "
+           "number of allocations from a single block (default: 4)\n");
+        DP("  <PoolSize>     := positive integer or empty, "
+           "max pool size in MB (default: 256)\n");
+      }
+    }
+  }
+
+  if (StringEnvar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS").isPresent()) {
+    // To match SYCL RT behavior, we just need to check whether
+    // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value
+    // does not matter.
+    CommonSpecConstants.addConstant<char>(0xFF747469, 1);
+  }
+
+  // LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE=<SizeInKB>
+  const Envar<size_t> StagingBufferSizeVar(
+      "LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE");
+  if (StagingBufferSizeVar.isPresent()) {
+    size_t SizeInKB = StagingBufferSizeVar;
+    if (SizeInKB > (16 << 10)) {
+      SizeInKB = (16 << 10);
+      DP("Staging buffer size is capped at %zu KB\n", SizeInKB);
+    }
+    StagingBufferSize = SizeInKB << 10;
+  }
+
+  // LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE=<Fmt>
+  // <Fmt> := sync | async | async_ordered
+  // sync: perform synchronization after each command
+  // async: perform synchronization when it is required
+  // async_ordered: same as "async", but command is ordered
+  // This option is ignored unless IMM is fully enabled on compute and copy.
+  // On Intel PVC GPU, when used with immediate command lists over Level Zero
+  // backend, a target region may involve multiple command submissions to the
+  // L0 copy queue and compute queue. L0 events are used for each submission
+  // (data transfer of a single item or kernel execution). When "async" is
+  // specified, a) each data transfer to device is submitted with an event.
+  // b) The kernel is submitted next with a dependence on all the previous
+  // data transfer events. The kernel also has an event associated with it.
+  // c) The data transfer from device will be submitted with a dependence on
+  // the kernel event. d) Finally wait on the host for all the events
+  // associated with the data transfer from device.
+  // The env-var also affects any "target update" constructs as well.
+  // The env-var only affects the L0 copy/compute commands issued from a
+  // single target construct execution, not across multiple invocations.
+  const StringEnvar CommandModeVar("LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE");
+  if (CommandModeVar.isPresent()) {
+    if (match(CommandModeVar, "sync"))
+      CommandMode = CommandModeTy::Sync;
+    else if (match(CommandModeVar, "async"))
+      CommandMode = CommandModeTy::Async;
+    else if (match(CommandModeVar, "async_ordered"))
+      CommandMode = CommandModeTy::AsyncOrdered;
+    else
+      INVALID_OPTION(LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE,
+                     CommandModeVar.get().c_str());
+  }
+}
+/// Parse \p Filter and split it into string_view tokens separated by the
+/// \p Delim string. Empty tokens are dropped unless \p ProhibitEmptyTokens
+/// is set, in which case an ONEAPI_DEVICE_SELECTOR parsing error is reported.
+/// The returned views alias \p Filter and are only valid while it lives.
+std::vector<std::string_view>
+L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
+                      bool ProhibitEmptyTokens) {
+  std::vector<std::string_view> Tokens;
+  size_t Pos = 0;
+  size_t LastPos = 0;
+  while ((Pos = Filter.find(Delim, LastPos)) != std::string_view::npos) {
+    std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
+
+    if (!Tok.empty()) {
+      Tokens.push_back(Tok);
+    } else if (ProhibitEmptyTokens) {
+      FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
+                      "before '%s' delimiter is not allowed.",
+                      Delim.c_str());
+    }
+    // Move the search starting index past the whole delimiter (not just one
+    // character) so multi-character delimiters never leak into the next token.
+    LastPos = Pos + Delim.size();
+  }
+
+  // Add remainder if any
+  if (LastPos < Filter.size()) {
+    std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
+    Tokens.push_back(Tok);
+  } else if ((LastPos != 0) && ProhibitEmptyTokens) {
+    // The delimiter was the last symbol in the string.
+    FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
+                    "'%s' delimiter is not allowed.",
+                    Delim.c_str());
+  }
+  return Tokens;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
new file mode 100644
index 0000000000000..51d6595560484
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -0,0 +1,285 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/zes_api.h>
+
+#include "L0Device.h"
+#include "L0Interop.h"
+#include "L0Kernel.h"
+#include "L0Plugin.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+using namespace llvm::omp::target;
+using namespace error;
+
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+// Common data across all possible plugin instantiations
+L0OptionsTy LevelZeroPluginTy::Options;
+
+/// Enumerate Level Zero drivers and their devices, order discrete devices
+/// first, apply ONEAPI_DEVICE_SELECTOR filtering (explicit mode), and
+/// populate L0Devices. Returns the number of root devices (0 on any failure).
+int32_t LevelZeroPluginTy::findDevices() {
+  CALL_ZE_RET_ZERO(zeInit, ZE_INIT_FLAG_GPU_ONLY);
+  uint32_t NumDrivers = 0;
+  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, nullptr);
+  if (NumDrivers == 0) {
+    DP("Cannot find any drivers.\n");
+    return 0;
+  }
+  // Explicit mode: only devices selected by ONEAPI_DEVICE_SELECTOR are added.
+  const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
+
+  // We expect multiple drivers on Windows to support different device types,
+  // so we need to maintain multiple drivers and contexts in general.
+  llvm::SmallVector<ze_driver_handle_t> FoundDrivers(NumDrivers);
+  CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, FoundDrivers.data());
+
+  // Per-root-device bookkeeping used while ordering/filtering devices.
+  struct RootInfoTy {
+    uint32_t OrderId;           // Enumeration order across all drivers.
+    ze_device_handle_t zeDevice;
+    L0ContextTy *Driver;        // Owning driver context (element of ContextList).
+    bool IsDiscrete;
+  };
+  llvm::SmallVector<RootInfoTy> RootDevices;
+
+  uint32_t OrderId = 0;
+  for (uint32_t DriverId = 0; DriverId < NumDrivers; DriverId++) {
+    const auto &Driver = FoundDrivers[DriverId];
+    uint32_t DeviceCount = 0;
+    ze_result_t RC;
+    // A driver without devices is skipped, not treated as a hard error.
+    CALL_ZE(RC, zeDeviceGet, Driver, &DeviceCount, nullptr);
+    if (RC != ZE_RESULT_SUCCESS || DeviceCount == 0) {
+      DP("Cannot find any devices from driver " DPxMOD ".\n", DPxPTR(Driver));
+      continue;
+    }
+    // We have a driver that supports at least one device
+    ContextList.emplace_back(*this, Driver, DriverId);
+    auto &DrvInfo = ContextList.back();
+    llvm::SmallVector<ze_device_handle_t> FoundDevices(DeviceCount);
+    CALL_ZE_RET_ZERO(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
+
+    for (auto &zeDevice : FoundDevices)
+      RootDevices.push_back(
+          {OrderId++, zeDevice, &DrvInfo, L0DeviceTy::isDiscrete(zeDevice)});
+  }
+
+  // move discrete devices to the front
+  std::sort(RootDevices.begin(), RootDevices.end(),
+            [](const RootInfoTy &A, const RootInfoTy &B) {
+              // Devices of the same kind keep their enumeration order;
+              // otherwise a discrete device sorts before an integrated one.
+              if (A.IsDiscrete && B.IsDiscrete)
+                return A.OrderId < B.OrderId;
+              if (!A.IsDiscrete && !B.IsDiscrete)
+                return A.OrderId < B.OrderId;
+              return A.IsDiscrete;
+            });
+
+  // Identity of a device that passed filtering. SubId/CCSId of -1 mean
+  // "not a subdevice / no compute-slice index".
+  struct DeviceInfoTy {
+    L0DeviceIdTy Id;
+    L0ContextTy *Driver;
+    bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
+  };
+
+  llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
+
+  // helper lambdas
+  auto addDevice = [ExplicitMode,
+                    &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+                                   int32_t SubId = -1, int32_t CCSId = -1) {
+    if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
+      DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
+    }
+  };
+  for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
+    const auto zeDevice = RootDevices[RootId].zeDevice;
+    auto *RootDriver = RootDevices[RootId].Driver;
+    addDevice(zeDevice, RootDriver, RootId);
+  }
+  NumDevices = DevicesToAdd.size();
+  auto DeviceId = 0;
+  for (auto &DeviceInfo : DevicesToAdd) {
+    auto RootId = DeviceInfo.Id.RootId;
+    auto SubId = DeviceInfo.Id.SubId;
+    auto CCSId = DeviceInfo.Id.CCSId;
+    auto zeDevice = DeviceInfo.Id.zeId;
+    auto *Driver = DeviceInfo.Driver;
+
+    // Human-readable id: RootId[.SubId[.CCSId]], used for debug output.
+    std::string IdStr = std::to_string(RootId) +
+                        (SubId < 0 ? "" : "." + std::to_string(SubId)) +
+                        (CCSId < 0 ? "" : "." + std::to_string(CCSId));
+
+    // NOTE(review): raw `new` stored in L0Devices -- presumably released in
+    // the plugin destructor/deinit path; confirm ownership.
+    L0Devices.push_back(new L0DeviceTy(*this, DeviceId, getNumRootDevices(),
+                                       zeDevice, *Driver, std::move(IdStr),
+                                       CCSId < 0 ? 0 : CCSId /* ComputeIndex */
+                                       ));
+    DeviceId++;
+  }
+
+  DP("Found %" PRIu32 " root devices, %" PRIu32 " total devices.\n",
+     getNumRootDevices(), NumDevices);
+  DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
+  for (auto &l0Device : L0Devices) {
+    DP("-- %s\n", l0Device->getZeIdCStr());
+    (void)l0Device; // silence warning
+  }
+
+  if (getDebugLevel() > 0) {
+    DP("Root Device Information\n");
+    for (uint32_t I = 0; I < getNumRootDevices(); I++) {
+      auto &l0Device = getDeviceFromId(I);
+      l0Device.reportDeviceInfo();
+    }
+  }
+
+  return getNumRootDevices();
+}
+
+/// Clean-up routine to be invoked by the destructor or
+/// LevelZeroPluginTy::deinit.
+void LevelZeroPluginTy::closeRTL() {
+
+  // Clear per-thread tables before the driver contexts they may reference.
+  // NOTE(review): this teardown order looks intentional -- confirm before
+  // reordering.
+  ContextTLSTable.clear();
+  DeviceTLSTable.clear();
+  ThreadTLSTable.clear();
+  ContextList.clear();
+
+  DP("Plugin closed successfully\n");
+}
+
+/// Plugin-specific initialization: process environment options first, then
+/// discover the available Level Zero devices. Returns the root device count.
+Expected<int32_t> LevelZeroPluginTy::initImpl() {
+  DP("Level0 NG plugin initialization\n");
+  // process options before anything else
+  Options.init();
+  return findDevices();
+}
+
+/// Plugin-specific deinitialization; delegates all teardown to closeRTL().
+Error LevelZeroPluginTy::deinitImpl() {
+  DP("Deinit Level0 plugin!\n");
+  closeRTL();
+  return Plugin::success();
+}
+
+/// Return the device object for \p DeviceId. Devices were already constructed
+/// in findDevices(), so this only looks them up; the \p Plugin and
+/// \p NumDevices parameters of the generic interface are unused here.
+GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
+                                                 int32_t DeviceId,
+                                                 int32_t NumDevices) {
+  return &getDeviceFromId(DeviceId);
+}
+
+/// Create the Level Zero global-variable handler. The generic plugin layer
+/// takes ownership of the returned raw pointer.
+GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
+  return new L0GlobalHandlerTy();
+}
+
+/// ELF e_machine value this plugin accepts for device images.
+uint16_t LevelZeroPluginTy::getMagicElfBits() const {
+  // TODO: We need to register a real ELF machine type
+  return 0x8086;
+}
+
+/// Target architecture of device images handled by this plugin (SPIR-V 64).
+Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
+  return Triple::spirv64;
+}
+
+const char *LevelZeroPluginTy::getName() const { return GETNAME(TARGET_NAME); }
+
+/// Interop queue flush. Currently a no-op for this plugin.
+Error LevelZeroPluginTy::flushQueueImpl(omp_interop_val_t *Interop) {
+  return Plugin::success();
+}
+
+/// An image is compatible iff it is a valid oneAPI OpenMP offload image.
+/// The version numbers are validated inside isValidOneOmpImage and the
+/// out-values themselves are not needed here.
+Expected<bool> LevelZeroPluginTy::isELFCompatible(uint32_t DeviceId,
+                                                  StringRef Image) const {
+  uint64_t MajorVer, MinorVer;
+  return isValidOneOmpImage(Image, MajorVer, MinorVer);
+}
+
+/// Blocking interop barrier: wait on the host until all commands previously
+/// submitted to the interop's queue have completed.
+Error LevelZeroPluginTy::syncBarrierImpl(omp_interop_val_t *Interop) {
+  if (!Interop) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  // No underlying queue means there is nothing to wait for.
+  if (!Interop->async_info || !Interop->async_info->Queue)
+    return Plugin::success();
+
+  // L0 object
+  const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  const auto device_id = Interop->device_id;
+  auto &l0Device = getDeviceFromId(device_id);
+
+  // We can synchronize both L0 & SYCL objects with the same ze command
+  if (l0Device.useImmForInterop()) {
+    DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+       " with ImmCmdList barrier\n",
+       DPxPTR(Interop));
+    // Immediate command list: append a barrier that signals an event, then
+    // block on that event and return it to the device's event pool.
+    auto ImmCmdList = L0->ImmCmdList;
+    auto Event = l0Device.getEvent();
+
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, Event, 0,
+                      nullptr);
+    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
+    l0Device.releaseEvent(Event);
+  } else {
+    DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+       " with queue synchronize\n",
+       DPxPTR(Interop));
+    // Regular command queue: a full queue synchronize is sufficient.
+    auto CmdQueue = L0->CommandQueue;
+    CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+  }
+
+  return Plugin::success();
+}
+
+/// Non-blocking interop barrier: append a barrier command to the interop's
+/// queue so later commands are ordered after earlier ones, without waiting
+/// on the host.
+Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
+  if (!Interop) {
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+                         DPxPTR(Interop));
+  }
+  if (!Interop->async_info || !Interop->async_info->Queue)
+    return Plugin::success();
+  // An in-order queue already serializes its commands, so an explicit
+  // barrier is redundant. Check this before touching any other state.
+  if (Interop->attrs.inorder)
+    return Plugin::success();
+
+  const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+  const auto device_id = Interop->device_id;
+
+  auto &l0Device = getDeviceFromId(device_id);
+  if (l0Device.useImmForInterop()) {
+    DP("LevelZeroPluginTy::async_barrier: Appending ImmCmdList barrier "
+       "to " DPxMOD "\n",
+       DPxPTR(Interop));
+    // Immediate command list: the barrier executes as soon as it is appended.
+    auto ImmCmdList = L0->ImmCmdList;
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, nullptr, 0,
+                      nullptr);
+  } else {
+    DP("LevelZeroPluginTy::async_barrier: Appending CmdList barrier to " DPxMOD
+       "\n",
+       DPxPTR(Interop));
+    // Regular queue: record the barrier in a command list, close it, submit
+    // it to the queue, and reset the list for reuse.
+    auto CmdQueue = L0->CommandQueue;
+    ze_command_list_handle_t CmdList = l0Device.getCmdList();
+    CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+    CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+                      nullptr);
+    CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+  }
+
+  return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
+
+extern "C" {
+/// Plugin entry point used by libomptarget to instantiate the Level Zero
+/// plugin. Ownership of the returned object passes to the caller.
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_level_zero() {
+  return new llvm::omp::target::plugin::LevelZeroPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
new file mode 100644
index 0000000000000..33c19b0e7c50d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -0,0 +1,625 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <dlfcn.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // !_WIN32
+
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Resolve the device address of the global named by \p DeviceGlobal in the
+/// program associated with \p Image, storing it into \p DeviceGlobal.
+/// The pointer is recorded even when the lookup fails (as nullptr), in which
+/// case an error is returned.
+Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                                     DeviceImageTy &Image,
+                                                     GlobalTy &DeviceGlobal) {
+  const char *Name = DeviceGlobal.getName().data();
+
+  auto &l0Device = static_cast<L0DeviceTy &>(Device);
+  const L0ProgramTy *Program =
+      l0Device.getProgramFromImage(Image.getTgtImage());
+
+  // Save the symbol's device address unconditionally, allowing nullptr.
+  void *VarAddr = Program->getOffloadVarDeviceAddr(Name);
+  DeviceGlobal.setPtr(VarAddr);
+
+  if (VarAddr == nullptr)
+    return Plugin::error(ErrorCode::UNKNOWN, "Failed to load global '%s'",
+                         Name);
+
+  return Plugin::success();
+}
+
+/// Convenience accessor: the owning device downcast to L0DeviceTy.
+inline L0DeviceTy &L0ProgramTy::getL0Device() const {
+  return L0DeviceTy::makeL0Device(getDevice());
+}
+
+/// Release all kernels and Level Zero modules owned by this program.
+L0ProgramTy::~L0ProgramTy() {
+  for (auto *Kernel : Kernels) {
+    // We need explicit destructor and deallocate calls to release the kernels
+    // created by `GenericDeviceTy::constructKernel()`.
+    Kernel->~L0KernelTy();
+    getL0Device().getPlugin().free(Kernel);
+  }
+  for (auto Module : Modules) {
+    CALL_ZE_RET_VOID(zeModuleDestroy, Module);
+  }
+}
+
+/// Decide whether this image should be compiled as a library module (no
+/// kernels of its own, hosted in a dynamic library) and record the result in
+/// IsLibModule. Detection relies on dladdr() and is skipped on Windows.
+void L0ProgramTy::setLibModule() {
+// Use #ifdef for consistency with the other _WIN32 guards in this file.
+#ifdef _WIN32
+  return;
+#else
+  const auto *Image = getTgtImage();
+  const size_t NumEntries =
+      static_cast<size_t>(Image->EntriesEnd - Image->EntriesBegin);
+  for (size_t I = 0; I < NumEntries; I++) {
+    const auto &Entry = Image->EntriesBegin[I];
+    // Image contains a kernel, so it is not compiled as a library module
+    if (Entry.SymbolName && Entry.Size == 0)
+      return;
+  }
+  // Check if the image belongs to a dynamic library
+  Dl_info DLI{nullptr};
+  if (dladdr(Image->ImageStart, &DLI) && DLI.dli_fname) {
+    // Read the containing file and inspect its ELF header: an ET_DYN type
+    // means the image lives in a shared library.
+    std::vector<uint8_t> FileBin;
+    auto Size = readFile(DLI.dli_fname, FileBin);
+    if (Size) {
+      auto MB = MemoryBuffer::getMemBuffer(
+          StringRef(reinterpret_cast<const char *>(FileBin.data()), Size),
+          /*BufferName=*/"", /*RequiresNullTerminator=*/false);
+      auto ELF = ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+      if (ELF) {
+        if (auto *Obj = dyn_cast<ELF64LEObjectFile>((*ELF).get())) {
+          const auto Header = Obj->getELFFile().getHeader();
+          if (Header.e_type == ELF::ET_DYN) {
+            DP("Processing current image as library\n");
+            IsLibModule = true;
+          }
+        }
+      }
+    }
+  }
+#endif // _WIN32
+}
+
+/// Build a Level Zero module from a single \p Image of the given \p Format
+/// and register it with this program and its device. Returns OFFLOAD_SUCCESS
+/// or OFFLOAD_FAIL (a failed build of a library module is tolerated).
+int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
+                               const std::string &CommonBuildOptions,
+                               ze_module_format_t Format) {
+  const ze_module_constants_t SpecConstants =
+      LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
+  auto &l0Device = getL0Device();
+  std::string BuildOptions(CommonBuildOptions);
+
+  // Add required flag to enable dynamic linking.
+  if (IsLibModule)
+    BuildOptions += " -library-compilation ";
+
+  ze_module_desc_t ModuleDesc{};
+  ModuleDesc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
+  ModuleDesc.pNext = nullptr;
+  ModuleDesc.format = Format;
+  ze_module_handle_t Module = nullptr;
+  ze_module_build_log_handle_t BuildLog = nullptr;
+  ze_result_t RC;
+
+  // Build a single module from a single image
+  ModuleDesc.inputSize = Size;
+  ModuleDesc.pInputModule = Image;
+  ModuleDesc.pBuildFlags = BuildOptions.c_str();
+  ModuleDesc.pConstants = &SpecConstants;
+  CALL_ZE_RC(RC, zeModuleCreate, l0Device.getZeContext(),
+             l0Device.getZeDevice(), &ModuleDesc, &Module, &BuildLog);
+
+  const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
+
+  // zeModuleCreate produces a build-log handle on both success and failure;
+  // release it here to avoid leaking the log object. RC has already been
+  // consumed into BuildFailed, so it can be reused.
+  if (BuildLog)
+    CALL_ZE_RC(RC, zeModuleBuildLogDestroy, BuildLog);
+
+  if (BuildFailed) {
+    // A library module that fails to build is not fatal.
+    if (IsLibModule)
+      return OFFLOAD_SUCCESS;
+    return OFFLOAD_FAIL;
+  } else {
+    // Check if module link is required. We do not need this check for
+    // library module
+    if (!RequiresModuleLink && !IsLibModule) {
+      ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+                                           nullptr, 0};
+      CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+      RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+    }
+    // For now, assume the first module contains libraries, globals.
+    if (Modules.empty())
+      GlobalModule = Module;
+    Modules.push_back(Module);
+    l0Device.addGlobalModule(Module);
+    return OFFLOAD_SUCCESS;
+  }
+}
+
+/// Dynamically link all global modules registered with the device, if any
+/// module declared imported symbols. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
+int32_t L0ProgramTy::linkModules() {
+  auto &l0Device = getL0Device();
+  if (!RequiresModuleLink) {
+    DP("Module link is not required\n");
+    return OFFLOAD_SUCCESS;
+  }
+
+  if (Modules.empty()) {
+    DP("Invalid number of modules when linking modules\n");
+    return OFFLOAD_FAIL;
+  }
+
+  ze_result_t RC;
+  ze_module_build_log_handle_t LinkLog = nullptr;
+  CALL_ZE_RC(RC, zeModuleDynamicLink,
+             static_cast<uint32_t>(l0Device.getNumGlobalModules()),
+             l0Device.getGlobalModulesArray(), &LinkLog);
+  const bool LinkFailed = (RC != ZE_RESULT_SUCCESS);
+  // Release the link-log handle produced by zeModuleDynamicLink to avoid
+  // leaking it. RC has already been consumed into LinkFailed.
+  if (LinkLog)
+    CALL_ZE_RC(RC, zeModuleBuildLogDestroy, LinkLog);
+  return LinkFailed ? OFFLOAD_FAIL : OFFLOAD_SUCCESS;
+}
+
+/// Read the whole file \p FileName into \p OutFile.
+/// Returns the number of bytes read, or 0 on any failure (in which case
+/// \p OutFile is left empty).
+size_t L0ProgramTy::readFile(const char *FileName,
+                             std::vector<uint8_t> &OutFile) const {
+  // Open at the end (std::ios::ate) so tellg() immediately yields the size.
+  std::ifstream Stream(FileName, std::ios::binary | std::ios::ate);
+  if (!Stream.good())
+    return 0;
+  const auto NumBytes = static_cast<size_t>(Stream.tellg());
+  OutFile.resize(NumBytes);
+  Stream.seekg(0, std::ios::beg);
+  if (Stream.read(reinterpret_cast<char *>(OutFile.data()), NumBytes))
+    return NumBytes;
+  OutFile.clear();
+  return 0;
+}
+
+/// Read SPV from file name. The name is resolved relative to the directory
+/// containing this plugin's shared library (located via a symbol address).
+/// Returns OFFLOAD_SUCCESS and fills \p OutSPV, or OFFLOAD_FAIL.
+int32_t L0ProgramTy::readSPVFile(const char *FileName,
+                                 std::vector<uint8_t> &OutSPV) const {
+  // Resolve full path using the location of the plugin
+  std::string FullPath;
+#ifdef _WIN32
+  // Map the address of a known exported symbol back to the module that
+  // contains it, then query that module's file path.
+  char RTLPath[_MAX_PATH];
+  HMODULE RTLModule = nullptr;
+  if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                              GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                          (LPCSTR)&__tgt_target_data_begin_nowait,
+                          &RTLModule)) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  FullPath = RTLPath;
+#else  // _WIN32
+  // dladdr() reports the path of the shared object containing the symbol.
+  Dl_info RTLInfo;
+  if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
+    DP("Error: module creation failed -- cannot resolve full path\n");
+    return OFFLOAD_FAIL;
+  }
+  FullPath = RTLInfo.dli_fname;
+#endif // _WIN32
+  // Replace the library file name with the requested file name.
+  const size_t PathSep = FullPath.find_last_of("/\\");
+  FullPath.replace(PathSep + 1, std::string::npos, FileName);
+  // Read from the full path
+  if (!readFile(FullPath.c_str(), OutSPV)) {
+    DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+/// Rewrite driver-level compile options in \p Options into their
+/// backend-specific equivalents. Only the first occurrence of each known
+/// option is replaced; unknown options are left untouched.
+void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+                                                   std::string &Options) const {
+  // Table of driver option -> backend option translations.
+  struct OptMapTy {
+    std::string Driver;
+    std::string Backend;
+  };
+  static const OptMapTy Translations[] = {
+      {"-ftarget-compile-fast",
+       "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"},
+      {"-foffload-fp32-prec-div", "-ze-fp32-correctly-rounded-divide-sqrt"},
+      {"-foffload-fp32-prec-sqrt", "-ze-fp32-correctly-rounded-divide-sqrt"},
+  };
+
+  for (const auto &Map : Translations) {
+    const size_t Where = Options.find(Map.Driver);
+    if (Where != std::string::npos)
+      Options.replace(Where, Map.Driver.length(), Map.Backend);
+  }
+}
+
+// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h:
+#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3
+
+/// Check whether \p Image is an Intel oneAPI OpenMP offload ELF image of a
+/// supported version. Scans SHT_NOTE sections for "INTELONEOMPOFFLOAD" notes;
+/// if a NT_INTEL_ONEOMP_OFFLOAD_VERSION note is found, \p MajorVer and
+/// \p MinorVer are set from it and the image is valid only for version 1.0.
+/// If no version note exists, the image is accepted when any
+/// INTELONEOMPOFFLOAD note section was seen; the out-params stay untouched.
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer) {
+  const auto MB = MemoryBuffer::getMemBuffer(Image,
+                                             /*BufferName=*/"",
+                                             /*RequiresNullTerminator=*/false);
+  auto ExpectedNewE =
+      ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+  if (!ExpectedNewE) {
+    // NOTE(review): the Expected's error is not explicitly consumed here --
+    // confirm this cannot trip the unchecked-error assertion in debug builds.
+    DP("Warning: unable to get ELF handle!\n");
+    return false;
+  }
+  bool Res = false;
+  // Shared implementation for the 64-bit and 32-bit little-endian ELF cases.
+  auto processObjF = [&](const auto ELFObjF) {
+    if (!ELFObjF) {
+      DP("Warning: Unexpected ELF type!\n");
+      return false;
+    }
+    const auto &ELFF = ELFObjF->getELFFile();
+    auto Sections = ELFF.sections();
+    if (!Sections) {
+      DP("Warning: unable to get ELF sections!\n");
+      return false;
+    }
+    bool SeenOffloadSection = false;
+    for (auto Sec : *Sections) {
+      if (Sec.sh_type != ELF::SHT_NOTE)
+        continue;
+      Error Err = Plugin::success();
+      // Err is checked inside the loop; notes() reports iteration failures
+      // through it lazily.
+      for (auto Note : ELFF.notes(Sec, Err)) {
+        if (Err) {
+          DP("Warning: unable to get ELF notes handle!\n");
+          return false;
+        }
+        if (Note.getName() != "INTELONEOMPOFFLOAD")
+          continue;
+        SeenOffloadSection = true;
+        if (Note.getType() != NT_INTEL_ONEOMP_OFFLOAD_VERSION)
+          continue;
+
+        // The note payload (4-byte aligned) carries the version string.
+        std::string DescStr(std::move(Note.getDescAsStringRef(4).str()));
+        const auto DelimPos = DescStr.find('.');
+        if (DelimPos == std::string::npos) {
+          // The version has to look like "Major#.Minor#".
+          DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n",
+             DescStr.c_str());
+          return false;
+        }
+        const std::string MajorVerStr = DescStr.substr(0, DelimPos);
+        DescStr.erase(0, DelimPos + 1);
+        MajorVer = std::stoull(MajorVerStr);
+        MinorVer = std::stoull(DescStr);
+        // Only oneAPI OpenMP offload version 1.0 is supported.
+        return (MajorVer == 1 && MinorVer == 0);
+      }
+    }
+    return SeenOffloadSection;
+  };
+  if (const auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+    Res = processObjF(O);
+  } else if (const auto *O =
+                 dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+    Res = processObjF(O);
+  } else {
+    assert(false && "Unexpected ELF format");
+  }
+  return Res;
+}
+
+/// View the raw bytes of a device image as a StringRef (no copy).
+static StringRef getImageStringRef(const __tgt_device_image *Image) {
+  const char *Begin = reinterpret_cast<char *>(Image->ImageStart);
+  const char *End = reinterpret_cast<char *>(Image->ImageEnd);
+  return StringRef(Begin, static_cast<size_t>(End - Begin));
+}
+
+/// Overload taking a __tgt_device_image; forwards the image's raw bytes to
+/// the StringRef-based validator above.
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+                        uint64_t &MinorVer) {
+  return isValidOneOmpImage(getImageStringRef(Image), MajorVer, MinorVer);
+}
+
+int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
+  auto &l0Device = getL0Device();
+  auto *Image = getTgtImage();
+  if (identify_magic(getImageStringRef(Image)) == file_magic::spirv_object) {
+    // Handle legacy plain SPIR-V image.
+    uint8_t *ImgBegin = reinterpret_cast<uint8_t *>(Image->ImageStart);
+    uint8_t *ImgEnd = reinterpret_cast<uint8_t *>(Image->ImageEnd);
+    size_t ImgSize = ImgEnd - ImgBegin;
+    return addModule(ImgSize, ImgBegin, BuildOptions,
+                     ZE_MODULE_FORMAT_IL_SPIRV);
+  }
+
+  uint64_t MajorVer, MinorVer;
+  if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+    DP("Warning: image is not a valid oneAPI OpenMP image.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  setLibModule();
+
+  // Iterate over the images and pick the first one that fits.
+  uint64_t ImageCount = 0;
+  struct V1ImageInfo {
+    // 0 - native, 1 - SPIR-V
+    uint64_t Format = (std::numeric_limits<uint64_t>::max)();
+    std::string CompileOpts;
+    std::string LinkOpts;
+    // We may have multiple sections created from split-kernel mode
+    std::vector<const uint8_t *> PartBegin;
+    std::vector<uint64_t> PartSize;
+
+    V1ImageInfo(uint64_t Format, std::string CompileOpts, std::string LinkOpts)
+        : Format(Format), CompileOpts(std::move(CompileOpts)),
+          LinkOpts(std::move(LinkOpts)) {}
+  };
+  std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
+
+  auto MB = MemoryBuffer::getMemBuffer(getImageStringRef(Image),
+                                       /*BufferName=*/"",
+                                       /*RequiresNullTerminator=*/false);
+  auto ExpectedNewE =
+      ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+  assert(ExpectedNewE &&
+         "isValidOneOmpImage() returns true for invalid ELF image");
+  auto processELF = [&](auto *EObj) {
+    assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
+    assert(MajorVer == 1 && MinorVer == 0 &&
+           "FIXME: update image processing for new oneAPI OpenMP version.");
+    const auto &E = EObj->getELFFile();
+    // Collect auxiliary information.
+    uint64_t MaxImageIdx = 0;
+
+    auto Sections = E.sections();
+    assert(Sections && "isValidOneOmpImage() returns true for ELF image with "
+                       "invalid sections.");
+
+    for (auto Sec : *Sections) {
+      if (Sec.sh_type != ELF::SHT_NOTE)
+        continue;
+      Error Err = Plugin::success();
+      for (auto Note : E.notes(Sec, Err)) {
+        assert(!Err && "isValidOneOmpImage() returns true for ELF image with "
+                       "invalid notes.");
+        if (Note.getName().str() != "INTELONEOMPOFFLOAD")
+          continue;
+
+        const uint64_t Type = Note.getType();
+        std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+        switch (Type) {
+        default:
+          DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
+          ImageCount = std::stoull(DescStr);
+          break;
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
+          std::vector<std::string> Parts;
+          do {
+            const auto DelimPos = DescStr.find('\0');
+            if (DelimPos == std::string::npos) {
+              Parts.push_back(std::move(DescStr));
+              break;
+            }
+            Parts.push_back(DescStr.substr(0, DelimPos));
+            DescStr.erase(0, DelimPos + 1);
+          } while (Parts.size() < 4);
+
+          // Ignore records with less than 4 strings.
+          if (Parts.size() != 4) {
+            DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX "
+               "record is ignored.\n");
+            continue;
+          }
+
+          const uint64_t Idx = std::stoull(Parts[0]);
+          MaxImageIdx = (std::max)(MaxImageIdx, Idx);
+          if (AuxInfo.find(Idx) != AuxInfo.end()) {
+            DP("Warning: duplicate auxiliary information for image %" PRIu64
+               " is ignored.\n",
+               Idx);
+            continue;
+          }
+          AuxInfo.emplace(
+              std::piecewise_construct, std::forward_as_tuple(Idx),
+              std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+          // Image pointer and size
+          // will be initialized later.
+        }
+        }
+      }
+    }
+
+    if (MaxImageIdx >= ImageCount)
+      DP("Warning: invalid image index found in auxiliary information.\n");
+
+    for (auto Sec : *Sections) {
+      const char *Prefix = "__openmp_offload_spirv_";
+      auto ExpectedSectionName = E.getSectionName(Sec);
+      assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
+                                    "image with invalid section names");
+      std::string SectionName = (*ExpectedSectionName).str();
+      if (SectionName.find(Prefix) != 0)
+        continue;
+      SectionName.erase(0, std::strlen(Prefix));
+
+      // Expected section name in split-kernel mode:
+      // __openmp_offload_spirv_<image_id>_<part_id>
+      auto PartIdLoc = SectionName.find("_");
+      if (PartIdLoc != std::string::npos) {
+        DP("Found a split section in the image\n");
+        // It seems that we do not need part ID as long as they are ordered
+        // in the image and we keep the ordering in the runtime.
+        SectionName.erase(PartIdLoc);
+      } else {
+        DP("Found a single section in the image\n");
+      }
+
+      uint64_t Idx = std::stoull(SectionName);
+      if (Idx >= ImageCount) {
+        DP("Warning: ignoring image section (index %" PRIu64
+           " is out of range).\n",
+           Idx);
+        continue;
+      }
+
+      auto AuxInfoIt = AuxInfo.find(Idx);
+      if (AuxInfoIt == AuxInfo.end()) {
+        DP("Warning: ignoring image section (no aux info).\n");
+        continue;
+      }
+      auto Contents = E.getSectionContents(Sec);
+      assert(Contents);
+      AuxInfoIt->second.PartBegin.push_back((*Contents).data());
+      AuxInfoIt->second.PartSize.push_back(Sec.sh_size);
+    }
+  };
+
+  if (auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+    processELF(O);
+  } else if (auto *O = dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+    processELF(O);
+  } else {
+    assert(false && "Unexpected ELF format");
+  }
+
+  for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) {
+    const auto It = AuxInfo.find(Idx);
+    if (It == AuxInfo.end()) {
+      DP("Warning: image %" PRIu64
+         " without auxiliary information is ignored.\n",
+         Idx);
+      continue;
+    }
+
+    const auto NumParts = It->second.PartBegin.size();
+    // Split-kernel is not supported in SPIRV format
+    if (NumParts > 1 && It->second.Format != 0) {
+      DP("Warning: split-kernel images are not supported in SPIRV format\n");
+      continue;
+    }
+
+    // Skip unknown image format
+    if (It->second.Format != 0 && It->second.Format != 1) {
+      DP("Warning: image %" PRIu64 " is ignored due to unknown format.\n", Idx);
+      continue;
+    }
+
+    const bool IsBinary = (It->second.Format == 0);
+    const auto ModuleFormat =
+        IsBinary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV;
+    std::string Options = BuildOptions;
+    {
+      Options += " " + It->second.CompileOpts + " " + It->second.LinkOpts;
+      replaceDriverOptsWithBackendOpts(l0Device, Options);
+    }
+
+    for (size_t I = 0; I < NumParts; I++) {
+      const unsigned char *ImgBegin =
+          reinterpret_cast<const unsigned char *>(It->second.PartBegin[I]);
+      size_t ImgSize = It->second.PartSize[I];
+
+      auto RC = addModule(ImgSize, ImgBegin, Options, ModuleFormat);
+
+      if (RC != OFFLOAD_SUCCESS) {
+        DP("Error: failed to create program from %s "
+           "(%" PRIu64 "-%zu).\n",
+           IsBinary ? "Binary" : "SPIR-V", Idx, I);
+        return OFFLOAD_FAIL;
+      }
+    }
+
+    DP("Created module from image #%" PRIu64 ".\n", Idx);
+    BuildOptions = std::move(Options);
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  return OFFLOAD_FAIL;
+}
+
+/// Look up the device address of an OpenMP global variable by name.
+/// Returns nullptr when the name is null, no module is loaded, or the
+/// variable is not found in any module of this program.
+void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
+  DP("Looking up OpenMP global variable '%s'.\n", CName);
+
+  if (!GlobalModule || !CName)
+    return nullptr;
+
+  // zeModuleGetGlobalPointer requires a size out-parameter even when the
+  // size is not needed; DevicePtr is the only output we care about.
+  size_t SizeDummy = 0;
+  void *DevicePtr = nullptr;
+  ze_result_t RC;
+  // The variable may live in any of the program's modules; return the first
+  // match. Pass CName directly instead of materializing a std::string copy.
+  for (auto Module : Modules) {
+    CALL_ZE(RC, zeModuleGetGlobalPointer, Module, CName, &SizeDummy,
+            &DevicePtr);
+    if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+      return DevicePtr;
+  }
+  DP("Warning: global variable '%s' was not found in the device.\n", CName);
+  return nullptr;
+}
+
+/// Copy \p Size bytes of the device global variable \p Name into \p HostPtr.
+/// Returns OFFLOAD_FAIL if the variable cannot be resolved on the device.
+int32_t L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
+                                        void *HostPtr) {
+  // Guard against a missing global module (no modules added yet) instead of
+  // handing a null module handle to the L0 API.
+  if (!GlobalModule) {
+    DP("Warning: cannot read from device global variable %s\n", Name);
+    return OFFLOAD_FAIL;
+  }
+  size_t SizeDummy = 0;
+  void *DevicePtr = nullptr;
+  ze_result_t RC;
+  CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+          &DevicePtr);
+  if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+    DP("Warning: cannot read from device global variable %s\n", Name);
+    return OFFLOAD_FAIL;
+  }
+  // Device-to-host copy of the resolved global.
+  return getL0Device().enqueueMemCopy(HostPtr, DevicePtr, Size);
+}
+
+/// Copy \p Size bytes from \p HostPtr into the device global variable \p Name.
+/// Returns OFFLOAD_FAIL if the variable cannot be resolved on the device.
+int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
+                                         const void *HostPtr) {
+  // Guard against a missing global module (no modules added yet) instead of
+  // handing a null module handle to the L0 API.
+  if (!GlobalModule) {
+    DP("Warning: cannot write to device global variable %s\n", Name);
+    return OFFLOAD_FAIL;
+  }
+  size_t SizeDummy = 0;
+  void *DevicePtr = nullptr;
+  ze_result_t RC;
+  CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+          &DevicePtr);
+  if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+    DP("Warning: cannot write to device global variable %s\n", Name);
+    return OFFLOAD_FAIL;
+  }
+  // Host-to-device copy into the resolved global.
+  return getL0Device().enqueueMemCopy(DevicePtr, HostPtr, Size);
+}
+
+/// Record which module provides each kernel name. The offload entries do not
+/// identify the module a kernel came from, so the map is needed to build
+/// kernels later.
+int32_t L0ProgramTy::loadModuleKernels() {
+  // NOTE: the original declared a local ModuleKernels map here that was never
+  // used; KernelsToModuleMap is the only consumer of the kernel names.
+  for (auto Module : Modules) {
+    uint32_t Count = 0;
+    // First call queries the number of kernels in the module.
+    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);
+    if (Count == 0)
+      continue;
+
+    llvm::SmallVector<const char *> Names(Count);
+    CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, Names.data());
+
+    for (auto *Name : Names)
+      KernelsToModuleMap.emplace(Name, Module);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
new file mode 100644
index 0000000000000..3721d686393bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
@@ -0,0 +1,71 @@
+//===--- level_zero/src/OmpWrapper.cpp --------------------------- C++ -*-===//
+//
+// Implement wrapper for OpenMP compatibility through dlopen
+//
+//===----------------------------------------------------------------------===//
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "L0Defs.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(omp_get_max_teams, 0)
+DLWRAP_INTERNAL(omp_get_teams_thread_limit, 0)
+
+DLWRAP_FINALIZE()
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+/// Lazily dlopen libomp.so and resolve every dlwrap'ed OpenMP symbol.
+/// Returns true when all symbols were resolved. On failure the lookup is
+/// retried on the next call.
+static bool loadOpenMP() {
+  static bool Loaded{false};
+  if (Loaded)
+    return true;
+
+  const char *OpenMPLibrary = "libomp.so";
+  std::string ErrMsg;
+
+  DP("Trying to load %s\n", OpenMPLibrary);
+  // DynamicLibrary is a small value type; no need to heap-allocate it.
+  llvm::sys::DynamicLibrary Dynlib =
+      llvm::sys::DynamicLibrary::getPermanentLibrary(OpenMPLibrary, &ErrMsg);
+  if (!Dynlib.isValid()) {
+    if (ErrMsg.empty())
+      ErrMsg = "unknown error";
+    DP("Unable to load library '%s': %s!\n", OpenMPLibrary, ErrMsg.c_str());
+    return false;
+  }
+
+  // Bind every wrapped symbol; fail if any is missing.
+  for (size_t I = 0; I < dlwrap::size(); I++) {
+    const char *Sym = dlwrap::symbol(I);
+
+    void *P = Dynlib.getAddressOfSymbol(Sym);
+    if (P == nullptr) {
+      DP("Unable to find '%s' in '%s'!\n", Sym, OpenMPLibrary);
+      return false;
+    }
+    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+
+    *dlwrap::pointer(I) = P;
+  }
+
+  // BUGFIX: cache success so later calls skip the dlopen and symbol lookups.
+  // The original never set this flag, re-resolving on every call.
+  Loaded = true;
+  return true;
+}
+
+/// Forward omp_get_max_teams to the dynamically resolved libomp entry point;
+/// report 0 when the OpenMP runtime is unavailable.
+int omp_get_max_teams() {
+  return loadOpenMP() ? dlwrap_omp_get_max_teams() : 0;
+}
+
+/// Forward omp_get_teams_thread_limit to the dynamically resolved libomp
+/// entry point; report 0 when the OpenMP runtime is unavailable.
+int omp_get_teams_thread_limit() {
+  return loadOpenMP() ? dlwrap_omp_get_teams_thread_limit() : 0;
+}

>From f8956cd31b8abdabe3714108489fd2795ffd6013 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:02:58 +0200
Subject: [PATCH 02/13] Update offload/CMakeLists.txt

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 8a704ab05eb53..3432ca3c29059 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -180,7 +180,7 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
         CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
   if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
     message(STATUS "Not building Level Zero plugin: it is only supported on "
-	           "Linux/Windows x86_64, ppc64le, or aarch64 hosts")
+	           "Linux/Windows x86_64 or ppc64le hosts")
     list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
   endif()
 endif()

>From d3fc4d70f7e62cf5e3993e3c431c1430a8ff2d22 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:03:12 +0200
Subject: [PATCH 03/13] Update
 offload/plugins-nextgen/level_zero/CMakeLists.txt

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/plugins-nextgen/level_zero/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index b9c8dd423c3ca..8e465d663655c 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
-return()
+  return()
 endif()
 
 # Create the library and add the default arguments.

>From 4b383862881d6201e44885da90f30c48312ab8dd Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 23:09:41 +0200
Subject: [PATCH 04/13] Update
 offload/plugins-nextgen/level_zero/include/L0Plugin.h

Co-authored-by: Alexey Sachkov <alexey.sachkov at intel.com>
---
 offload/plugins-nextgen/level_zero/include/L0Plugin.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index 4658c1cdab1df..de78ded59c2ce 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -45,7 +45,6 @@ class LevelZeroPluginTy final : public GenericPluginTy {
   /// L0 plugin global options
   static L0OptionsTy Options;
 
-  /// Global mutex
   std::mutex GlobalMutex;
 
   /// Common pool of AsyncQueue

>From 6c1c820a923a3f017f4bc9cf054d9c6b39bb6f77 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 10:47:46 +0200
Subject: [PATCH 05/13] Replace pragma once

---
 offload/plugins-nextgen/level_zero/include/AsyncQueue.h | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Context.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Defs.h     | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Device.h   | 4 +++-
 offload/plugins-nextgen/level_zero/include/L0Interop.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Kernel.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Memory.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Options.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Plugin.h   | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Program.h  | 5 ++++-
 offload/plugins-nextgen/level_zero/include/L0Trace.h    | 5 ++++-
 offload/plugins-nextgen/level_zero/include/TLS.h        | 5 ++++-
 12 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 105f68205e402..e26661a613772 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 
 #include <vector>
 
@@ -48,3 +49,5 @@ typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
index b2b6def8101ca..69748a3e61d01 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Context.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
 
 #include "L0Memory.h"
 #include "PerThreadTable.h"
@@ -136,3 +137,5 @@ class L0ContextTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 81566f52a2aea..05c287f4da013 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -9,7 +9,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
 
 #include "PluginInterface.h"
 #include "Shared/Requirements.h"
@@ -71,3 +72,5 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
                        __func__);
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 6acfa7e0ee67d..e22cfd928c0af 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
 
 #include "llvm/ADT/SmallVector.h"
 
@@ -678,3 +679,4 @@ class L0DeviceTy final : public GenericDeviceTy {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
index 4b8b417f9b339..69a1a5f274068 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Interop.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
 
 namespace llvm::omp::target::plugin::L0Interop {
 
@@ -23,3 +24,5 @@ struct Property {
 };
 
 } // namespace llvm::omp::target::plugin::L0Interop
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index bc6fc54cdea08..eca416d6fa882 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
 
 #include "L0Defs.h"
 #include "L0Trace.h"
@@ -152,3 +153,5 @@ class L0KernelTy : public GenericKernelTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index 50af80a19a93a..f5547201c994f 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
 
 #include <cassert>
 #include <level_zero/ze_api.h>
@@ -572,3 +573,5 @@ class StagingBufferTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b3ecd25f56ddd..a501df693f311 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
 
 #include <level_zero/ze_api.h>
 
@@ -187,3 +188,5 @@ struct L0OptionsTy {
 }; // L0OptionsTy
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
index de78ded59c2ce..9fbdafa288592 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Plugin.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
 
 #include "AsyncQueue.h"
 #include "L0Defs.h"
@@ -133,3 +134,5 @@ class LevelZeroPluginTy final : public GenericPluginTy {
 };
 
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index a548b486f4642..d156cce268182 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
 
 #include "L0Kernel.h"
 
@@ -133,3 +134,5 @@ bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
 bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
                         uint64_t &MinorVer);
 } // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index 2eeae81016dee..f8519bd44ae79 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 // clang-format off
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
 
 #include "Shared/Debug.h"
 #include "omptarget.h"
@@ -191,3 +192,5 @@ inline const char *getZeErrorName(int32_t Error) {
     return "ZE_RESULT_ERROR_UNKNOWN";
   }
 }
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
index 8a5f41312e129..46086ee4b6d19 100644
--- a/offload/plugins-nextgen/level_zero/include/TLS.h
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -10,7 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#pragma once
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H
 
 #include "AsyncQueue.h"
 #include "L0Memory.h"
@@ -84,3 +85,5 @@ struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
 } // namespace target
 } // namespace omp
 } // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H

>From fd91c47605a4ad06d6a5780d69e530995f4e2035 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:20:34 +0200
Subject: [PATCH 06/13] Address review comments

---
 .../level_zero/include/AsyncQueue.h           |  2 ++
 .../level_zero/include/L0Defs.h               |  3 +-
 .../level_zero/include/L0Memory.h             |  2 +-
 .../level_zero/include/L0Options.h            |  8 ++---
 .../level_zero/src/L0Device.cpp               |  6 ++--
 .../level_zero/src/L0Kernel.cpp               | 10 +++++-
 .../level_zero/src/L0Options.cpp              |  2 +-
 .../level_zero/src/L0Program.cpp              | 32 +++++++++----------
 8 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index e26661a613772..2d32f1767a7b6 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -13,6 +13,8 @@
 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H
 
+#include <list>
+#include <tuple>
 #include <vector>
 
 #include "L0Memory.h"
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
index 05c287f4da013..66d38cd7b9eb5 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Defs.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -1,4 +1,5 @@
 //===--- Level Zero Target RTL Implementation -----------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -43,7 +44,7 @@ LIBOMP_DECL(double, omp_get_wtime(void));
 namespace llvm::omp::target::plugin {
 
 /// Default alignmnet for allocation
-constexpr size_t L0Alignment = 0;
+constexpr size_t L0DefaultAlignment = 0;
 /// Default staging buffer size for host to device copy (16KB)
 constexpr size_t L0StagingBufferSize = (1 << 14);
 /// Default staging buffer count
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
index f5547201c994f..63115b1a3c529 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Memory.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -506,7 +506,7 @@ class StagingBufferTy {
     void *Ret = nullptr;
     size_t AllocSize = Size * Count;
     CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
-                     L0Alignment, &Ret);
+                     L0DefaultAlignment, &Ret);
     Buffers.push_back(Ret);
     return Ret;
   }
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index a501df693f311..7e64f71054569 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -141,9 +141,9 @@ struct L0OptionsTy {
 
   bool Init = false; // have the options already been processed
 
-  /// Read environment variables
   L0OptionsTy() {}
 
+  /// Read environment variables
   void processEnvironmentVars();
 
   void init() {
@@ -155,9 +155,9 @@ struct L0OptionsTy {
 
   /// Parse the string and split it into tokens of string_views based on the
   /// Delim character.
-  std::vector<std::string_view> tokenize(const std::string_view &Filter,
-                                         const std::string &Delim,
-                                         bool ProhibitEmptyTokens = false);
+  static std::vector<std::string_view>
+  tokenize(const std::string_view &Filter, const std::string &Delim,
+           bool ProhibitEmptyTokens = false);
 
   bool isDigits(const std::string_view &str) {
     if (str.size() == 0)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 0029d00a07685..2235741ea70a4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -64,15 +64,15 @@ constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0
 DeviceArchTy L0DeviceTy::computeArch() const {
   const auto PCIDeviceId = getPCIId();
   if (PCIDeviceId != 0) {
-    for (int arch = 0; arch < DeviceArchMapSize; arch++) {
+    for (int ArchIndex = 0; ArchIndex < DeviceArchMapSize; ArchIndex++) {
       for (int i = 0;; i++) {
-        const auto Id = DeviceArchMap[arch].ids[i];
+        const auto Id = DeviceArchMap[ArchIndex].ids[i];
         if (Id == PCIIdTy::None)
           break;
 
         auto maskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
         if (maskedId == Id)
-          return DeviceArchMap[arch].arch; // Exact match or prefix match
+          return DeviceArchMap[ArchIndex].arch; // Exact match or prefix match
       }
     }
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index d1cb0b7bd50bd..b0a13a07ab919 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,7 +163,9 @@ void L0KernelTy::decideKernelGroupArguments(
   uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
   bool UsedReductionSubscriptionRate = false;
   if (!MaxGroupCountForced) {
-    { GRPCounts[0] *= OptSubscRate; }
+    { 
+      GRPCounts[0] *= OptSubscRate; 
+    }
 
     size_t LoopTripcount = 0;
     if (LoopLevels) {
@@ -626,6 +628,12 @@ int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
     }
   } else {
     ze_event_handle_t Event = nullptr;
+    if (AllowCooperative)
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+                       zeKernel, &GroupCounts, Event, 0, nullptr);
+    else
+      CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+                       &GroupCounts, Event, 0, nullptr);
     KernelLock.unlock();
     CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
     CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 3acb2e78927e7..cb3a23b3e8bd4 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -99,7 +99,7 @@ void L0OptionsTy::processEnvironmentVars() {
 
         std::string_view TopDeviceStr = DeviceSubTuple[0];
         static const std::array<std::string, 7> DeviceStr = {
-            "host", "cpu", "gpu", "acc", "fpga", "*"};
+            "host", "cpu", "gpu", "acc", "*"};
         auto It =
             find_if(DeviceStr.begin(), DeviceStr.end(),
                     [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 33c19b0e7c50d..9828f379e681a 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -132,22 +132,22 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
     if (IsLibModule)
       return OFFLOAD_SUCCESS;
     return OFFLOAD_FAIL;
-  } else {
-    // Check if module link is required. We do not need this check for
-    // library module
-    if (!RequiresModuleLink && !IsLibModule) {
-      ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
-                                           nullptr, 0};
-      CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
-      RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
-    }
-    // For now, assume the first module contains libraries, globals.
-    if (Modules.empty())
-      GlobalModule = Module;
-    Modules.push_back(Module);
-    l0Device.addGlobalModule(Module);
-    return OFFLOAD_SUCCESS;
   }
+
+  // Check if module link is required. We do not need this check for
+  // library module
+  if (!RequiresModuleLink && !IsLibModule) {
+    ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+                                         nullptr, 0};
+    CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+    RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+  }
+  // For now, assume the first module contains libraries, globals.
+  if (Modules.empty())
+    GlobalModule = Module;
+  Modules.push_back(Module);
+  l0Device.addGlobalModule(Module);
+  return OFFLOAD_SUCCESS;
 }
 
 int32_t L0ProgramTy::linkModules() {
@@ -376,8 +376,6 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
          "isValidOneOmpImage() returns true for invalid ELF image");
   auto processELF = [&](auto *EObj) {
     assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
-    assert(MajorVer == 1 && MinorVer == 0 &&
-           "FIXME: update image processing for new oneAPI OpenMP version.");
     const auto &E = EObj->getELFFile();
     // Collect auxiliary information.
     uint64_t MaxImageIdx = 0;

>From 84665dc22f710e4401e65dea6d20e18128c94daa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 11:44:35 +0200
Subject: [PATCH 07/13] Fix makefile format

---
 .../plugins-nextgen/level_zero/CMakeLists.txt | 92 ++++++++++---------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
index 8e465d663655c..df38671c040ab 100644
--- a/offload/plugins-nextgen/level_zero/CMakeLists.txt
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -6,64 +6,68 @@ endif()
 add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
 
 set(LEVEL_ZERO_SRC_FILES
-        src/L0Context.cpp
-        src/L0Device.cpp
-        src/L0Kernel.cpp
-        src/L0Memory.cpp
-        src/L0Program.cpp
-        src/L0Plugin.cpp
-        src/L0Program.cpp
-        src/L0Options.cpp
+  src/L0Context.cpp
+  src/L0Device.cpp
+  src/L0Kernel.cpp
+  src/L0Memory.cpp
+  src/L0Program.cpp
+  src/L0Plugin.cpp
+  src/L0Program.cpp
+  src/L0Options.cpp
 )
 list(APPEND LEVEL_ZERO_SRC_FILES
-        src/OmpWrapper.cpp
+  src/OmpWrapper.cpp
 )
 
 target_sources(omptarget.rtl.level_zero PRIVATE
-   ${LEVEL_ZERO_SRC_FILES}
+  ${LEVEL_ZERO_SRC_FILES}
 )
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
-      ${CMAKE_CURRENT_SOURCE_DIR}/include
-      ${CMAKE_CURRENT_SOURCE_DIR}/src
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
 target_include_directories(omptarget.rtl.level_zero PRIVATE
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
-      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
-      ${LIBOMPTARGET_OMP_HEADER_DIR}
+  ${LIBOMPTARGET_INCLUDE_DIR}
+  ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+  ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+  ${LIBOMPTARGET_OMP_HEADER_DIR}
 )
 
 if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
-message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+  message(STATUS "Building Level Zero NG plugin linked against level_zero library")
 
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  target_link_libraries(omptarget.rtl.level_zero PRIVATE
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
-elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
-  # Full path to the L0 library is recognized as a linker option, so we
-  # separate directory and file name
-  get_filename_component(LEVEL_ZERO_LIBRARY_PATH
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
-  get_filename_component(LEVEL_ZERO_LIBRARY_NAME
-          ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-  target_link_libraries(omptarget.rtl.level_zero PRIVATE
-          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
-  target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
-  target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
-  libomptarget_add_resource_file(omptarget.rtl.level_zero)
+  if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(omptarget.rtl.level_zero PRIVATE
+                        ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+  elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    # Full path to the L0 library is recognized as a linker option, so we
+    # separate directory and file name
+    get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+    get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+                           ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+    target_link_libraries(omptarget.rtl.level_zero PRIVATE
+                          ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+    target_link_directories(omptarget.rtl.level_zero PRIVATE 
+                            ${LEVEL_ZERO_LIBRARY_PATH})
+    target_link_options(omptarget.rtl.level_zero PRIVATE 
+                        "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+    libomptarget_add_resource_file(omptarget.rtl.level_zero)
+  else()
+    message(FATAL_ERROR "Missing platform support")
+  endif()
 else()
-   message(FATAL_ERROR "Missing platfrom support")
-endif()
-
-else()
-message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
-get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
-if(CMAKE_SYSTEM_NAME MATCHES "Windows")
-   # Windows uses dll instead of lib files at runtime
-   string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
-endif()
-target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
-target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+  message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+  get_filename_component(LEVEL_ZERO_LIBRARY_NAME 
+                         ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+  if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    # Windows uses dll instead of lib files at runtime
+    string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME 
+           ${LEVEL_ZERO_LIBRARY_NAME})
+  endif()
+  target_compile_options(omptarget.rtl.level_zero PRIVATE 
+                         "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+  target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
 endif()

>From a2217dbd426065c7b0f831a66947738a636b0c74 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Wed, 17 Sep 2025 14:56:40 +0200
Subject: [PATCH 08/13] change to StringRef in multiple places

---
 .../level_zero/include/AsyncQueue.h           |  2 +-
 .../level_zero/include/L0Options.h            | 29 ++-------
 .../level_zero/src/L0Device.cpp               |  7 ++-
 .../level_zero/src/L0Program.cpp              | 59 ++++++++++---------
 4 files changed, 43 insertions(+), 54 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
index 2d32f1767a7b6..dfa8c54b1c124 100644
--- a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Async Queue wrapper for SPIR-V/Xe machine
+// Async Queue wrapper for Level Zero
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index 7e64f71054569..ba62aa9ac0afa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -63,7 +63,6 @@ class SpecConstantsTy {
     return Tmp;
   }
 };
-#define FIXED static constexpr
 
 /// L0 Plugin flags
 struct L0OptionFlagsTy {
@@ -94,7 +93,7 @@ struct L0OptionsTy {
   std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
 
   /// Oversubscription rate for normal kernels
-  FIXED uint32_t SubscriptionRate = 4;
+  uint32_t SubscriptionRate = 4;
 
   /// Loop kernels with known ND-range may be known to have
   /// few iterations and they may not exploit the offload device
@@ -112,7 +111,7 @@ struct L0OptionsTy {
   /// in the kernel should decrease.
   /// Anyway, this is just a heuristics that seems to work well for some
   /// kernels (which poorly expose parallelism in the first place).
-  FIXED double ThinThreadsThreshold = 0.1;
+  double ThinThreadsThreshold = 0.1;
 
   /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
   /// All the discard filter should be before the accept filter.
@@ -127,8 +126,8 @@ struct L0OptionsTy {
   // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
   // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
   // builtins.
-  std::string CompilationOptions = "-cl-std=CL2.0 ";
-  std::string InternalCompilationOptions = "-cl-take-global-address";
+  static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
+  static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
   std::string UserCompilationOptions = "";
 
   // Spec constants used for all modules.
@@ -165,24 +164,8 @@ struct L0OptionsTy {
     return std::all_of(str.begin(), str.end(), ::isdigit);
   }
 
-  bool match(const std::string &Var, const std::string &Matched) {
-    if (Var.size() != Matched.size())
-      return false;
-
-    auto equals = [](char a, char b) {
-      return std::tolower(a) == std::tolower(b);
-    };
-    return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
-                      equals);
-  }
-
-  bool match(const std::string &Var, const char *Matched) {
-    std::string Str(Matched);
-    return match(Var, Str);
-  }
-
-  bool match(const StringEnvar &Var, const char *Matched) {
-    return match(Var.get(), Matched);
+  bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
+    return Matched.equals_insensitive(Var.get());
   }
 
 }; // L0OptionsTy
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 2235741ea70a4..1ef66751655d6 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -413,13 +413,14 @@ L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
   (void)NumEntries; // silence warning
 
   const auto &Options = getPlugin().getOptions();
-  std::string CompilationOptions(Options.CompilationOptions + " " +
-                                 Options.UserCompilationOptions);
+  std::string CompilationOptions(Options.CompilationOptions);
+  CompilationOptions += " " + Options.UserCompilationOptions;
 
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
        "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
 
-  CompilationOptions += " " + Options.InternalCompilationOptions;
+  CompilationOptions += " ";
+  CompilationOptions += Options.InternalCompilationOptions;
   auto &Program = addProgram(ImageId, TgtImage);
 
   int32_t RC = Program.buildModules(CompilationOptions);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 9828f379e681a..e7448757b9141 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -395,7 +395,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
           continue;
 
         const uint64_t Type = Note.getType();
-        std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+        auto DescStrRef = Note.getDescAsStringRef(4);
         switch (Type) {
         default:
           DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
@@ -403,19 +403,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
           break;
         case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
-          ImageCount = std::stoull(DescStr);
+          if (DescStrRef.getAsInteger(10, ImageCount)) {
+            DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
+               DescStrRef.str().c_str());
+            ImageCount = 0;
+          }
           break;
-        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
-          std::vector<std::string> Parts;
-          do {
-            const auto DelimPos = DescStr.find('\0');
-            if (DelimPos == std::string::npos) {
-              Parts.push_back(std::move(DescStr));
-              break;
-            }
-            Parts.push_back(DescStr.substr(0, DelimPos));
-            DescStr.erase(0, DelimPos + 1);
-          } while (Parts.size() < 4);
+        case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX:
+          llvm::SmallVector<llvm::StringRef, 4> Parts;
+          DescStrRef.split(Parts, '\0', /* MaxSplit = */ 4,
+                           /* KeepEmpty = */ false);
 
           // Ignore records with less than 4 strings.
           if (Parts.size() != 4) {
@@ -424,7 +421,8 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
             continue;
           }
 
-          const uint64_t Idx = std::stoull(Parts[0]);
+          uint64_t Idx = 0;
+          Parts[0].getAsInteger(10, Idx);
           MaxImageIdx = (std::max)(MaxImageIdx, Idx);
           if (AuxInfo.find(Idx) != AuxInfo.end()) {
             DP("Warning: duplicate auxiliary information for image %" PRIu64
@@ -432,13 +430,16 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
                Idx);
             continue;
           }
+
+          uint64_t Part1Id;
+          Parts[1].getAsInteger(10, Part1Id);
+
           AuxInfo.emplace(
               std::piecewise_construct, std::forward_as_tuple(Idx),
-              std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+              std::forward_as_tuple(Part1Id, Parts[2].str(), Parts[3].str()));
           // Image pointer and size
           // will be initialized later.
         }
-        }
       }
     }
 
@@ -450,24 +451,28 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
       auto ExpectedSectionName = E.getSectionName(Sec);
       assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
                                     "image with invalid section names");
-      std::string SectionName = (*ExpectedSectionName).str();
-      if (SectionName.find(Prefix) != 0)
+      auto &SectionNameRef = *ExpectedSectionName;
+      if (!SectionNameRef.consume_front(Prefix))
         continue;
-      SectionName.erase(0, std::strlen(Prefix));
 
       // Expected section name in split-kernel mode:
       // __openmp_offload_spirv_<image_id>_<part_id>
-      auto PartIdLoc = SectionName.find("_");
-      if (PartIdLoc != std::string::npos) {
-        DP("Found a split section in the image\n");
-        // It seems that we do not need part ID as long as they are ordered
-        // in the image and we keep the ordering in the runtime.
-        SectionName.erase(PartIdLoc);
-      } else {
+      auto Parts = SectionNameRef.split('_');
+      // It seems that we do not need part ID as long as they are ordered
+      // in the image and we keep the ordering in the runtime.
+      SectionNameRef = Parts.first;
+      if (Parts.second.empty()) {
         DP("Found a single section in the image\n");
+      } else {
+        DP("Found a split section in the image\n");
       }
 
-      uint64_t Idx = std::stoull(SectionName);
+      uint64_t Idx = 0;
+      if (SectionNameRef.getAsInteger(10, Idx)) {
+        DP("Warning: ignoring image section (invalid index '%s').\n",
+           SectionNameRef.str().c_str());
+        continue;
+      }
       if (Idx >= ImageCount) {
         DP("Warning: ignoring image section (index %" PRIu64
            " is out of range).\n",

>From 08880a623e89a5eb7f9deebce9ced19e6d2b9e1a Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 09:49:34 +0200
Subject: [PATCH 09/13] remove tokenize

---
 .../level_zero/include/L0Options.h            |  12 --
 .../level_zero/src/L0Options.cpp              | 106 ++++++------------
 2 files changed, 37 insertions(+), 81 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index ba62aa9ac0afa..b08a07f52fcc0 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -152,18 +152,6 @@ struct L0OptionsTy {
     }
   }
 
-  /// Parse the string and split it into tokens of string_views based on the
-  /// Delim character.
-  static std::vector<std::string_view>
-  tokenize(const std::string_view &Filter, const std::string &Delim,
-           bool ProhibitEmptyTokens = false);
-
-  bool isDigits(const std::string_view &str) {
-    if (str.size() == 0)
-      return false;
-    return std::all_of(str.begin(), str.end(), ::isdigit);
-  }
-
   bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
     return Matched.equals_insensitive(Var.get());
   }
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index cb3a23b3e8bd4..d0871c715b180 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -53,43 +53,51 @@ void L0OptionsTy::processEnvironmentVars() {
   if (DeviceSelectorVar.isPresent()) {
     std::string EnvStr(std::move(DeviceSelectorVar.get()));
     uint32_t numDiscard = 0;
-    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
-                   [](unsigned char C) { return std::tolower(C); });
+    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
 
-    std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+    llvm::StringRef EnvRef(EnvStr);
+    llvm::SmallVector<llvm::StringRef> Entries;
+    EnvRef.split(Entries, ';', /* MaxSplit = */ 0,
+                 /* KeepEmpty = */ false);
     for (const auto &Term : Entries) {
       bool isDiscard = false;
-      std::vector<std::string_view> Pair = tokenize(Term, ":", true);
-      if (Pair.empty()) {
+
+      auto Parts = Term.split(':');
+      if (Parts.first.empty()) {
         FAILURE_MESSAGE(
             "Incomplete selector! Pair and device must be specified.\n");
-      } else if (Pair.size() == 1) {
-        FAILURE_MESSAGE("Incomplete selector!  Try '%s:*'if all devices "
-                        "under the Pair was original intention.\n",
-                        Pair[0].data());
-      } else if (Pair.size() > 2) {
+      }
+      if (Parts.second.empty()) {
+        FAILURE_MESSAGE(
+            "Incomplete selector! Pair and device must be specified.\n");
+      }
+      if (Parts.second.contains(':')) {
         FAILURE_MESSAGE(
             "Error parsing selector string \"%s\" Too many colons (:)\n",
             Term.data());
       }
-      if (!((Pair[0][0] == '*') ||
-            (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
-            (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+
+      if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
+          Parts.first == "!level_zero"))
         break;
-      isDiscard = Pair[0][0] == '!';
+      isDiscard = Parts.first[0] == '!';
+
       if (isDiscard)
         numDiscard++;
       else if (numDiscard > 0)
         FAILURE_MESSAGE("All negative(discarding) filters must appear after "
                         "all positive(accepting) filters!");
 
-      std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+      llvm::SmallVector<llvm::StringRef> Targets;
+      Parts.second.split(Targets, ',', /* MaxSplit = */ 0,
+                         /* KeepEmpty = */ false);
       for (const auto &TargetStr : Targets) {
         bool HasDeviceWildCard = false;
         bool HasSubDeviceWildCard = false;
         bool DeviceNum = false;
-        std::vector<std::string_view> DeviceSubTuple =
-            tokenize(TargetStr, ".", true);
+        llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
+        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
+                         /* KeepEmpty = */ false);
         int32_t RootD[3] = {-1, -1, -1};
         if (DeviceSubTuple.empty()) {
           FAILURE_MESSAGE(
@@ -97,7 +105,7 @@ void L0OptionsTy::processEnvironmentVars() {
               "specified.");
         }
 
-        std::string_view TopDeviceStr = DeviceSubTuple[0];
+        auto TopDeviceStr = DeviceSubTuple[0];
         static const std::array<std::string, 7> DeviceStr = {
             "host", "cpu", "gpu", "acc", "*"};
         auto It =
@@ -107,15 +115,13 @@ void L0OptionsTy::processEnvironmentVars() {
           if (TopDeviceStr[0] == '*') {
             HasDeviceWildCard = true;
             RootD[0] = -2;
-          } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+          } else if (TopDeviceStr == "gpu")
             continue;
         } else {
-          std::string TDS(TopDeviceStr);
-          if (!isDigits(TDS)) {
+          if (TopDeviceStr.getAsInteger(10, RootD[0])) {
             FAILURE_MESSAGE("error parsing device number: %s",
-                            DeviceSubTuple[0].data());
+                            DeviceSubTuple[0].str().c_str());
           } else {
-            RootD[0] = std::stoi(TDS);
             DeviceNum = true;
           }
         }
@@ -124,7 +130,7 @@ void L0OptionsTy::processEnvironmentVars() {
             FAILURE_MESSAGE("sub-devices can only be requested when parent "
                             "device is specified by number or wildcard, not a "
                             "device type like \'gpu\'");
-          std::string_view SubDeviceStr = DeviceSubTuple[1];
+          auto SubDeviceStr = DeviceSubTuple[1];
           if (SubDeviceStr[0] == '*') {
             HasSubDeviceWildCard = true;
             RootD[1] = -2;
@@ -134,28 +140,24 @@ void L0OptionsTy::processEnvironmentVars() {
                   "sub-device can't be requested by number if parent "
                   "device is specified by a wildcard.");
 
-            std::string SDS(SubDeviceStr);
-            if (!isDigits(SDS)) {
+            if (SubDeviceStr.getAsInteger(10, RootD[1])) {
               FAILURE_MESSAGE("error parsing subdevice index: %s",
-                              DeviceSubTuple[1].data());
-            } else
-              RootD[1] = std::stoi(SDS);
+                              DeviceSubTuple[1].str().c_str());
+            }
           }
         }
         if (DeviceSubTuple.size() == 3) {
-          std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+          auto SubSubDeviceStr = DeviceSubTuple[2];
           if (SubSubDeviceStr[0] == '*') {
             RootD[2] = -2;
           } else {
             if (HasSubDeviceWildCard)
               FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
                               "sub-device before is specified by a wildcard.");
-            std::string SSDS(SubSubDeviceStr);
-            if (!isDigits(SSDS)) {
+            if (SubSubDeviceStr.getAsInteger(10, RootD[2])) {
               FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
-                              DeviceSubTuple[2].data());
-            } else
-              RootD[2] = std::stoi(SSDS);
+                              DeviceSubTuple[2].str().c_str());
+            }
           }
         } else if (DeviceSubTuple.size() > 3) {
           FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
@@ -333,39 +335,5 @@ void L0OptionsTy::processEnvironmentVars() {
                      CommandModeVar.get().c_str());
   }
 }
-/// Parse String  and split into tokens of string_views based on the
-/// Delim character.
-std::vector<std::string_view>
-L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
-                      bool ProhibitEmptyTokens) {
-  std::vector<std::string_view> Tokens;
-  size_t Pos = 0;
-  size_t LastPos = 0;
-  while ((Pos = Filter.find(Delim, LastPos)) != std::string::npos) {
-    std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
-
-    if (!Tok.empty()) {
-      Tokens.push_back(Tok);
-    } else if (ProhibitEmptyTokens) {
-      FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
-                      "before '%s'delimiter is not allowed.",
-                      Delim.c_str());
-    }
-    // move the search starting index
-    LastPos = Pos + 1;
-  }
-
-  // Add remainder if any
-  if (LastPos < Filter.size()) {
-    std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
-    Tokens.push_back(Tok);
-  } else if ((LastPos != 0) && ProhibitEmptyTokens) {
-    // if delimiter is the last sybmol in the string.
-    FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
-                    "'%s' delimiter is not allowed.",
-                    Delim.c_str());
-  }
-  return Tokens;
-}
 
 } // namespace llvm::omp::target::plugin

>From 9a3088c49bc52ce643c63223608802eefc04b924 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 12:47:02 +0200
Subject: [PATCH 10/13] remove unused code

---
 .../level_zero/include/L0Program.h            |  1 -
 .../level_zero/include/L0Trace.h              |  7 ----
 .../level_zero/src/L0Program.cpp              | 38 -------------------
 3 files changed, 46 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
index d156cce268182..ca8b3b8a5cf52 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Program.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -60,7 +60,6 @@ class L0ProgramTy : public DeviceImageTy {
                     const std::string &BuildOption, ze_module_format_t Format);
   /// Read file and return the size of the binary if successful.
   size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
-  int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
   void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
                                         std::string &Options) const;
 
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
index f8519bd44ae79..0faa76171cbc9 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Trace.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -27,13 +27,6 @@
       DP(__VA_ARGS__);                                                         \
   } while (0)
 
-#define FATAL_ERROR(Msg)                                                       \
-  do {                                                                         \
-    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
-    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg);   \
-    exit(EXIT_FAILURE);                                                        \
-  } while (0)
-
 #define WARNING(...)                                                           \
   do {                                                                         \
     fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index e7448757b9141..eb5da943d56c9 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -187,44 +187,6 @@ size_t L0ProgramTy::readFile(const char *FileName,
   return FileSize;
 }
 
-/// Read SPV from file name
-int32_t L0ProgramTy::readSPVFile(const char *FileName,
-                                 std::vector<uint8_t> &OutSPV) const {
-  // Resolve full path using the location of the plugin
-  std::string FullPath;
-#ifdef _WIN32
-  char RTLPath[_MAX_PATH];
-  HMODULE RTLModule = nullptr;
-  if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
-                              GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-                          (LPCSTR)&__tgt_target_data_begin_nowait,
-                          &RTLModule)) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  FullPath = RTLPath;
-#else  // _WIN32
-  Dl_info RTLInfo;
-  if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
-    DP("Error: module creation failed -- cannot resolve full path\n");
-    return OFFLOAD_FAIL;
-  }
-  FullPath = RTLInfo.dli_fname;
-#endif // _WIN32
-  const size_t PathSep = FullPath.find_last_of("/\\");
-  FullPath.replace(PathSep + 1, std::string::npos, FileName);
-  // Read from the full path
-  if (!readFile(FullPath.c_str(), OutSPV)) {
-    DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
 void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
                                                    std::string &Options) const {
   // Options that need to be replaced with backend-specific options

>From 24d06455603aaf372ce20a69b776ba237f1edaaa Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 13:04:12 +0200
Subject: [PATCH 11/13] fix format

---
 offload/plugins-nextgen/level_zero/include/L0Options.h | 3 ++-
 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp    | 4 ++--
 offload/plugins-nextgen/level_zero/src/L0Options.cpp   | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index b08a07f52fcc0..e383f070f10aa 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -127,7 +127,8 @@ struct L0OptionsTy {
   // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
   // builtins.
   static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
-  static constexpr std::string_view InternalCompilationOptions = "-cl-take-global-address";
+  static constexpr std::string_view InternalCompilationOptions =
+      "-cl-take-global-address";
   std::string UserCompilationOptions = "";
 
   // Spec constants used for all modules.
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b0a13a07ab919..538e627405b6d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -163,8 +163,8 @@ void L0KernelTy::decideKernelGroupArguments(
   uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
   bool UsedReductionSubscriptionRate = false;
   if (!MaxGroupCountForced) {
-    { 
-      GRPCounts[0] *= OptSubscRate; 
+    {
+      GRPCounts[0] *= OptSubscRate;
     }
 
     size_t LoopTripcount = 0;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index d0871c715b180..1e0baa3f2b089 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -78,7 +78,7 @@ void L0OptionsTy::processEnvironmentVars() {
       }
 
       if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
-          Parts.first == "!level_zero"))
+            Parts.first == "!level_zero"))
         break;
       isDiscard = Parts.first[0] == '!';
 
@@ -97,7 +97,7 @@ void L0OptionsTy::processEnvironmentVars() {
         bool DeviceNum = false;
         llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
         TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
-                         /* KeepEmpty = */ false);
+                        /* KeepEmpty = */ false);
         int32_t RootD[3] = {-1, -1, -1};
         if (DeviceSubTuple.empty()) {
           FAILURE_MESSAGE(
@@ -106,8 +106,8 @@ void L0OptionsTy::processEnvironmentVars() {
         }
 
         auto TopDeviceStr = DeviceSubTuple[0];
-        static const std::array<std::string, 7> DeviceStr = {
-            "host", "cpu", "gpu", "acc", "*"};
+        static const std::array<std::string, 7> DeviceStr = {"host", "cpu",
+                                                             "gpu", "acc", "*"};
         auto It =
             find_if(DeviceStr.begin(), DeviceStr.end(),
                     [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });

>From 0eb57125a5caff61ac4de16256965f4fad0ad120 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 16:31:25 +0200
Subject: [PATCH 12/13] Remove environment variable

---
 .../level_zero/include/L0Options.h            |   7 -
 .../level_zero/src/L0Options.cpp              | 162 ------------------
 .../level_zero/src/L0Plugin.cpp               |  10 +-
 .../level_zero/src/L0Program.cpp              |   5 +-
 4 files changed, 4 insertions(+), 180 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
index e383f070f10aa..8c79a82ef724b 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Options.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -113,13 +113,6 @@ struct L0OptionsTy {
   /// kernels (which poorly expose parallelism in the first place).
   double ThinThreadsThreshold = 0.1;
 
-  /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
-  /// All the discard filter should be before the accept filter.
-  std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
-
-  /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
-  bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
-
   // Compilation options for IGC
   // OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
   // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
index 1e0baa3f2b089..7229e2498ae13 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Options.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -18,29 +18,6 @@
 
 namespace llvm::omp::target::plugin {
 
-/// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
-bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
-                                  int32_t CCSID) const {
-  if (ExplicitRootDevices.empty())
-    return false;
-  for (const auto &RootDev : ExplicitRootDevices) {
-    const auto ErootID = std::get<1>(RootDev);
-    if (ErootID != -2 && RootID != ErootID)
-      continue;
-    const auto EsubID = std::get<2>(RootDev);
-    if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
-      continue;
-    const auto ECCSID = std::get<3>(RootDev);
-    if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
-      continue;
-    // Check if isDiscard
-    if (!std::get<0>(RootDev))
-      return false;
-    return true;
-  }
-  return false;
-}
-
 /// Read environment variables
 void L0OptionsTy::processEnvironmentVars() {
   // Compilation options for IGC
@@ -48,145 +25,6 @@ void L0OptionsTy::processEnvironmentVars() {
       std::string(" ") +
       StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
 
-  // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
-  const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
-  if (DeviceSelectorVar.isPresent()) {
-    std::string EnvStr(std::move(DeviceSelectorVar.get()));
-    uint32_t numDiscard = 0;
-    std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(), tolower);
-
-    llvm::StringRef EnvRef(EnvStr);
-    llvm::SmallVector<llvm::StringRef> Entries;
-    EnvRef.split(Entries, ';', /* MaxSplit = */ 0,
-                 /* KeepEmpty = */ false);
-    for (const auto &Term : Entries) {
-      bool isDiscard = false;
-
-      auto Parts = Term.split(':');
-      if (Parts.first.empty()) {
-        FAILURE_MESSAGE(
-            "Incomplete selector! Pair and device must be specified.\n");
-      }
-      if (Parts.second.empty()) {
-        FAILURE_MESSAGE(
-            "Incomplete selector! Pair and device must be specified.\n");
-      }
-      if (Parts.second.contains(':')) {
-        FAILURE_MESSAGE(
-            "Error parsing selector string \"%s\" Too many colons (:)\n",
-            Term.data());
-      }
-
-      if (!(Parts.first[0] == '*' || Parts.first == "level_zero" ||
-            Parts.first == "!level_zero"))
-        break;
-      isDiscard = Parts.first[0] == '!';
-
-      if (isDiscard)
-        numDiscard++;
-      else if (numDiscard > 0)
-        FAILURE_MESSAGE("All negative(discarding) filters must appear after "
-                        "all positive(accepting) filters!");
-
-      llvm::SmallVector<llvm::StringRef> Targets;
-      Parts.second.split(Targets, ',', /* MaxSplit = */ 0,
-                         /* KeepEmpty = */ false);
-      for (const auto &TargetStr : Targets) {
-        bool HasDeviceWildCard = false;
-        bool HasSubDeviceWildCard = false;
-        bool DeviceNum = false;
-        llvm::SmallVector<llvm::StringRef, 3> DeviceSubTuple;
-        TargetStr.split(DeviceSubTuple, '.', /* MaxSplit = */ 0,
-                        /* KeepEmpty = */ false);
-        int32_t RootD[3] = {-1, -1, -1};
-        if (DeviceSubTuple.empty()) {
-          FAILURE_MESSAGE(
-              "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
-              "specified.");
-        }
-
-        auto TopDeviceStr = DeviceSubTuple[0];
-        static const std::array<std::string, 7> DeviceStr = {"host", "cpu",
-                                                             "gpu", "acc", "*"};
-        auto It =
-            find_if(DeviceStr.begin(), DeviceStr.end(),
-                    [&](auto DeviceStr) { return TopDeviceStr == DeviceStr; });
-        if (It != DeviceStr.end()) {
-          if (TopDeviceStr[0] == '*') {
-            HasDeviceWildCard = true;
-            RootD[0] = -2;
-          } else if (TopDeviceStr == "gpu")
-            continue;
-        } else {
-          if (TopDeviceStr.getAsInteger(10, RootD[0])) {
-            FAILURE_MESSAGE("error parsing device number: %s",
-                            DeviceSubTuple[0].str().c_str());
-          } else {
-            DeviceNum = true;
-          }
-        }
-        if (DeviceSubTuple.size() >= 2) {
-          if (!DeviceNum && !HasDeviceWildCard)
-            FAILURE_MESSAGE("sub-devices can only be requested when parent "
-                            "device is specified by number or wildcard, not a "
-                            "device type like \'gpu\'");
-          auto SubDeviceStr = DeviceSubTuple[1];
-          if (SubDeviceStr[0] == '*') {
-            HasSubDeviceWildCard = true;
-            RootD[1] = -2;
-          } else {
-            if (HasDeviceWildCard) // subdevice is a number and device is a *
-              FAILURE_MESSAGE(
-                  "sub-device can't be requested by number if parent "
-                  "device is specified by a wildcard.");
-
-            if (!SubDeviceStr.getAsInteger(10, RootD[1])) {
-              FAILURE_MESSAGE("error parsing subdevice index: %s",
-                              DeviceSubTuple[1].str().c_str());
-            }
-          }
-        }
-        if (DeviceSubTuple.size() == 3) {
-          auto SubSubDeviceStr = DeviceSubTuple[2];
-          if (SubSubDeviceStr[0] == '*') {
-            RootD[2] = -2;
-          } else {
-            if (HasSubDeviceWildCard)
-              FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
-                              "sub-device before is specified by a wildcard.");
-            if (!SubSubDeviceStr.getAsInteger(10, RootD[2])) {
-              FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
-                              DeviceSubTuple[2].str().c_str());
-            }
-          }
-        } else if (DeviceSubTuple.size() > 3) {
-          FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
-                          "supported at this time ",
-                          TargetStr.data());
-        }
-        if (isDiscard)
-          ExplicitRootDevices.insert(
-              ExplicitRootDevices.begin(),
-              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
-                                                          RootD[1], RootD[2]));
-        else
-          ExplicitRootDevices.push_back(
-              std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
-                                                          RootD[1], RootD[2]));
-      }
-    }
-  }
-
-  DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
-     ExplicitRootDevices.size());
-  DP("  (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
-     "-1(ignore)\n");
-  for (auto &T : ExplicitRootDevices) {
-    DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
-       std::get<2>(T), std::get<3>(T));
-    (void)T; // silence warning
-  }
-
   // Memory pool
   // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
   //  <Option>       := 0 | <PoolInfoList>
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
index 51d6595560484..d632d57ce3d5d 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -35,7 +35,6 @@ int32_t LevelZeroPluginTy::findDevices() {
     DP("Cannot find any drivers.\n");
     return 0;
   }
-  const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
 
   // We expect multiple drivers on Windows to support different device types,
   // so we need to maintain multiple drivers and contexts in general.
@@ -93,13 +92,10 @@ int32_t LevelZeroPluginTy::findDevices() {
 
   llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
 
-  // helper lambdas
-  auto addDevice = [ExplicitMode,
-                    &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+  // helper lambda
+  auto addDevice = [&DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
                                    int32_t SubId = -1, int32_t CCSId = -1) {
-    if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
-      DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
-    }
+    DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
   };
   for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
     const auto zeDevice = RootDevices[RootId].zeDevice;
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index eb5da943d56c9..8b31bf7e3a7ec 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -128,11 +128,8 @@ int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
 
   const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
 
-  if (BuildFailed) {
-    if (IsLibModule)
-      return OFFLOAD_SUCCESS;
+  if (BuildFailed)
     return OFFLOAD_FAIL;
-  }
 
   // Check if module link is required. We do not need this check for
   // library module

>From f491f3dc412729f8e8d4288f7c12ecf659a14dcc Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Thu, 18 Sep 2025 18:42:05 +0200
Subject: [PATCH 13/13] fix getAsInteger conditions

---
 offload/plugins-nextgen/level_zero/src/L0Program.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
index 8b31bf7e3a7ec..68ef755b2a852 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -362,7 +362,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
         case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
           break;
         case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
-          if (!DescStrRef.getAsInteger(10, ImageCount)) {
+          if (DescStrRef.getAsInteger(10, ImageCount)) {
             DP("Warning: invalid NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: '%s'\n",
                DescStrRef.str().c_str());
             ImageCount = 0;
@@ -427,7 +427,7 @@ int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
       }
 
       uint64_t Idx = 0;
-      if (!SectionNameRef.getAsInteger(10, Idx)) {
+      if (SectionNameRef.getAsInteger(10, Idx)) {
         DP("Warning: ignoring image section (invalid index '%s').\n",
            SectionNameRef.str().c_str());
         continue;



More information about the llvm-commits mailing list