[llvm] [OFFLOAD] Add plugin with support for Intel oneAPI Level Zero (PR #158900)
Alex Duran via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 02:53:20 PDT 2025
https://github.com/adurang created https://github.com/llvm/llvm-project/pull/158900
Add a new nextgen plugin that supports GPU devices through the Intel oneAPI Level Zero library.
This PR also adds new support to the PerThread auxiliary classes, which are used by the plugin.
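For reviewers who want to try it out, a possible configure line (just a sketch, assuming an LLVM runtimes build and that the Level Zero headers and the ze_loader library are discoverable on the system):

  cmake -G Ninja ../llvm \
    -DLLVM_ENABLE_PROJECTS="clang" \
    -DLLVM_ENABLE_RUNTIMES="openmp;offload" \
    -DLIBOMPTARGET_PLUGINS_TO_BUILD="level_zero;host"

LIBOMPTARGET_PLUGINS_TO_BUILD defaults to "all", which now includes level_zero on supported hosts, and the plugin is skipped automatically when the Level Zero dependencies are not found.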
>From 0c427647d9ce0de9506992dfb16074178bebcc19 Mon Sep 17 00:00:00 2001
From: Alex Duran <alejandro.duran at intel.com>
Date: Tue, 16 Sep 2025 11:46:48 +0200
Subject: [PATCH] [OFFLOAD] Add plugin with support for Intel Level Zero
---
offload/CMakeLists.txt | 17 +-
.../Modules/LibomptargetGetDependencies.cmake | 21 +
offload/include/OpenMP/InteropAPI.h | 7 +-
offload/include/PerThreadTable.h | 155 ++-
.../plugins-nextgen/common/include/DLWrap.h | 16 +
.../plugins-nextgen/level_zero/CMakeLists.txt | 69 ++
.../level_zero/include/AsyncQueue.h | 50 +
.../level_zero/include/L0Context.h | 138 +++
.../level_zero/include/L0Defs.h | 73 ++
.../level_zero/include/L0Device.h | 680 +++++++++++
.../level_zero/include/L0Interop.h | 25 +
.../level_zero/include/L0Kernel.h | 154 +++
.../level_zero/include/L0Memory.h | 574 +++++++++
.../level_zero/include/L0Options.h | 189 +++
.../level_zero/include/L0Plugin.h | 136 +++
.../level_zero/include/L0Program.h | 135 +++
.../level_zero/include/L0Trace.h | 193 +++
.../plugins-nextgen/level_zero/include/TLS.h | 86 ++
.../level_zero/src/L0Context.cpp | 41 +
.../level_zero/src/L0Device.cpp | 1065 +++++++++++++++++
.../level_zero/src/L0DynWrapper.cpp | 134 +++
.../level_zero/src/L0Kernel.cpp | 649 ++++++++++
.../level_zero/src/L0Memory.cpp | 637 ++++++++++
.../level_zero/src/L0Options.cpp | 371 ++++++
.../level_zero/src/L0Plugin.cpp | 285 +++++
.../level_zero/src/L0Program.cpp | 625 ++++++++++
.../level_zero/src/OmpWrapper.cpp | 71 ++
27 files changed, 6586 insertions(+), 10 deletions(-)
create mode 100644 offload/plugins-nextgen/level_zero/CMakeLists.txt
create mode 100644 offload/plugins-nextgen/level_zero/include/AsyncQueue.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Context.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Defs.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Device.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Interop.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Kernel.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Memory.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Options.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Plugin.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Program.h
create mode 100644 offload/plugins-nextgen/level_zero/include/L0Trace.h
create mode 100644 offload/plugins-nextgen/level_zero/include/TLS.h
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Context.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Device.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Memory.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Options.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/L0Program.cpp
create mode 100644 offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index b277380783500..8a704ab05eb53 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
endif()
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero)
set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
- "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+ "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".")
if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
@@ -176,6 +176,19 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
endif()
endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND
+ CMAKE_SYSTEM_NAME MATCHES "Linux|Windows"))
+ if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building Level Zero plugin: it is only supported on "
+                   "Linux/Windows x86_64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+ endif()
+endif()
+if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND
+ NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+ message(STATUS "Not building Level Zero plugin: dependencies not found")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero")
+endif()
message(STATUS "Building the offload library with support for "
"the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index 2a8bdebf2c1dd..0af0ae1ecdbec 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -89,4 +89,25 @@ if(LIBOMPTARGET_AMDGPU_ARCH)
endif()
endif()
+################################################################################
+# Looking for Level Zero
+################################################################################
+message(STATUS "Looking for Level Zero includes.")
+find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS NAMES level_zero/ze_api.h)
+
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS)
+ set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE)
+ message(STATUS "Could NOT find Level Zero. Missing includes.")
+else()
+ message(STATUS "Level Zero include DIR: ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}")
+ set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE)
+ message(STATUS "Looking for Level Zero library.")
+ find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES NAMES ze_loader)
+ if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES)
+ message(STATUS "Could NOT find Level Zero. Missing library.")
+ else()
+ message(STATUS "Level Zero library: ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES}")
+ endif()
+endif()
+
set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h
index 53ac4be2e2e98..2553bfa930784 100644
--- a/offload/include/OpenMP/InteropAPI.h
+++ b/offload/include/OpenMP/InteropAPI.h
@@ -160,17 +160,12 @@ struct InteropTableEntry {
Interops.push_back(obj);
}
- template <class ClearFuncTy> void clear(ClearFuncTy f) {
- for (auto &Obj : Interops) {
- f(Obj);
- }
- }
-
/// vector interface
int size() const { return Interops.size(); }
iterator begin() { return Interops.begin(); }
iterator end() { return Interops.end(); }
iterator erase(iterator it) { return Interops.erase(it); }
+ void clear() { Interops.clear(); }
};
struct InteropTblTy
diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h
index 45b196171b4c8..0241370953c67 100644
--- a/offload/include/PerThreadTable.h
+++ b/offload/include/PerThreadTable.h
@@ -16,6 +16,60 @@
#include <list>
#include <memory>
#include <mutex>
+#include <type_traits>
+
+template <typename ObjectType> struct PerThread {
+ struct PerThreadData {
+ std::unique_ptr<ObjectType> ThEntry;
+ };
+
+ std::mutex Mtx;
+ std::list<std::shared_ptr<PerThreadData>> ThreadDataList;
+
+ // define default constructors, disable copy and move constructors
+ PerThread() = default;
+ PerThread(const PerThread &) = delete;
+ PerThread(PerThread &&) = delete;
+ PerThread &operator=(const PerThread &) = delete;
+ PerThread &operator=(PerThread &&) = delete;
+ ~PerThread() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ThreadDataList.clear();
+ }
+
+private:
+ PerThreadData &getThreadData() {
+ static thread_local std::shared_ptr<PerThreadData> ThData = nullptr;
+ if (!ThData) {
+ ThData = std::make_shared<PerThreadData>();
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ThreadDataList.push_back(ThData);
+ }
+ return *ThData;
+ }
+
+protected:
+ ObjectType &getThreadEntry() {
+ auto &ThData = getThreadData();
+ if (ThData.ThEntry)
+ return *ThData.ThEntry;
+ ThData.ThEntry = std::make_unique<ObjectType>();
+ return *ThData.ThEntry;
+ }
+
+public:
+ ObjectType &get() { return getThreadEntry(); }
+
+ template <class F> void clear(F f) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ for (auto ThData : ThreadDataList) {
+ if (!ThData->ThEntry)
+ continue;
+ f(*ThData->ThEntry);
+ }
+ ThreadDataList.clear();
+ }
+};
// Using an STL container (such as std::vector) indexed by thread ID has
// too many race conditions issues so we store each thread entry into a
@@ -23,10 +77,32 @@
// T is the container type used to store the objects, e.g., std::vector,
// std::set, etc. by each thread. O is the type of the stored objects e.g.,
// omp_interop_val_t *, ...
-
template <typename ContainerType, typename ObjectType> struct PerThreadTable {
using iterator = typename ContainerType::iterator;
+ template <typename, typename = std::void_t<>>
+ struct has_iterator : std::false_type {};
+ template <typename T>
+ struct has_iterator<T, std::void_t<typename T::iterator>> : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_clear : std::false_type {};
+ template <typename T>
+ struct has_clear<T, std::void_t<decltype(std::declval<T>().clear())>>
+ : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_clearAll : std::false_type {};
+ template <typename T>
+ struct has_clearAll<T, std::void_t<decltype(std::declval<T>().clearAll(1))>>
+ : std::true_type {};
+
+ template <typename, typename = std::void_t<>>
+ struct is_associative : std::false_type {};
+ template <typename T>
+ struct is_associative<T, std::void_t<typename T::mapped_type>>
+ : std::true_type {};
+
struct PerThreadData {
size_t NElements = 0;
std::unique_ptr<ContainerType> ThEntry;
@@ -71,6 +147,11 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
return ThData.NElements;
}
+ void setNElements(size_t Size) {
+ auto &NElements = getThreadNElements();
+ NElements = Size;
+ }
+
public:
void add(ObjectType obj) {
auto &Entry = getThreadEntry();
@@ -104,11 +185,81 @@ template <typename ContainerType, typename ObjectType> struct PerThreadTable {
for (auto ThData : ThreadDataList) {
if (!ThData->ThEntry || ThData->NElements == 0)
continue;
- ThData->ThEntry->clear(f);
+ if constexpr (has_clearAll<ContainerType>::value) {
+ ThData->ThEntry->clearAll(f);
+ } else if constexpr (has_iterator<ContainerType>::value &&
+ has_clear<ContainerType>::value) {
+ for (auto &Obj : *ThData->ThEntry) {
+ if constexpr (is_associative<ContainerType>::value) {
+ f(Obj.second);
+ } else {
+ f(Obj);
+ }
+ }
+ ThData->ThEntry->clear();
+ } else {
+        static_assert(!sizeof(ContainerType), "Container type not supported");
+ }
ThData->NElements = 0;
}
ThreadDataList.clear();
}
};
+template <typename T, typename = std::void_t<>> struct ContainerValueType {
+ using type = typename T::value_type;
+};
+template <typename T>
+struct ContainerValueType<T, std::void_t<typename T::mapped_type>> {
+ using type = typename T::mapped_type;
+};
+
+template <typename ContainerType, size_t reserveSize = 0>
+struct PerThreadContainer
+ : public PerThreadTable<ContainerType,
+ typename ContainerValueType<ContainerType>::type> {
+
+ // helpers
+ template <typename T, typename = std::void_t<>> struct indexType {
+ using type = typename T::size_type;
+ };
+ template <typename T> struct indexType<T, std::void_t<typename T::key_type>> {
+ using type = typename T::key_type;
+ };
+ template <typename T, typename = std::void_t<>>
+ struct has_resize : std::false_type {};
+ template <typename T>
+ struct has_resize<T, std::void_t<decltype(std::declval<T>().resize(1))>>
+ : std::true_type {};
+
+ template <typename T, typename = std::void_t<>>
+ struct has_reserve : std::false_type {};
+ template <typename T>
+ struct has_reserve<T, std::void_t<decltype(std::declval<T>().reserve(1))>>
+ : std::true_type {};
+
+ using IndexType = typename indexType<ContainerType>::type;
+ using ObjectType = typename ContainerValueType<ContainerType>::type;
+
+ // Get the object for the given index in the current thread
+ ObjectType &get(IndexType Index) {
+ auto &Entry = this->getThreadEntry();
+
+ // specialized code for vector-like containers
+ if constexpr (has_resize<ContainerType>::value) {
+ if (Index >= Entry.size()) {
+ if constexpr (has_reserve<ContainerType>::value && reserveSize > 0) {
+ if (Entry.capacity() < reserveSize)
+ Entry.reserve(reserveSize);
+ }
+ // If the index is out of bounds, try resize the container
+ Entry.resize(Index + 1);
+ }
+ }
+ ObjectType &Ret = Entry[Index];
+ this->setNElements(Entry.size());
+ return Ret;
+ }
+};
+
#endif
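As an aside for reviewers, here is a hypothetical usage sketch of the new PerThreadContainer helper (CounterTy and example() are made up for illustration; they are not part of the patch):

  #include <unordered_map>
  #include "PerThreadTable.h"

  struct CounterTy {
    int Value = 0;
  };

  // One unordered_map<int, CounterTy> per thread; get() is indexed by the
  // map's key_type, and the stored object type is the mapped_type.
  using PerThreadCounters =
      PerThreadContainer<std::unordered_map<int, CounterTy>>;

  void example(PerThreadCounters &Table) {
    // Returns this thread's entry for key 7, default-constructing it on
    // first use.
    Table.get(7).Value++;
    // Visits every stored object from every thread, then drops the
    // per-thread data.
    Table.clear([](CounterTy &C) { C.Value = 0; });
  }

This mirrors how the plugin's TLS tables (e.g. L0ContextTLSTableTy in L0Context.h below) map a Level Zero handle to a per-thread object.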
diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h
index 8934e7e701021..95ce86e123cd3 100644
--- a/offload/plugins-nextgen/common/include/DLWrap.h
+++ b/offload/plugins-nextgen/common/include/DLWrap.h
@@ -282,5 +282,21 @@ template <size_t Requested, size_t Required> constexpr void verboseAssert() {
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
x9, x10); \
}
+#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9, \
+ typename T::template arg<10>::type x10, \
+ typename T::template arg<11>::type x11) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10, x11); \
+ }
#endif // OMPTARGET_SHARED_DLWRAP_H
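For context, the new 12-argument instantiation extends the existing DLWRAP machinery used by the dlopen-based wrappers. A hedged illustration, assuming the usual pattern from the other plugins (the exact entry point is only a guess; the real list is in L0DynWrapper.cpp in this patch):

  // In a dynamic wrapper source file:
  DLWRAP(zeCommandListAppendMemoryCopyRegion, 12);

This declares the Level Zero symbol locally and forwards its twelve arguments through the function pointer resolved from the dlopen'ed ze_loader library; zeCommandListAppendMemoryCopyRegion is one example of a 12-parameter entry point that would need this arity.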
diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt
new file mode 100644
index 0000000000000..b9c8dd423c3ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt
@@ -0,0 +1,69 @@
+if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND)
+return()
+endif()
+
+# Create the library and add the default arguments.
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO)
+
+set(LEVEL_ZERO_SRC_FILES
+ src/L0Context.cpp
+ src/L0Device.cpp
+ src/L0Kernel.cpp
+ src/L0Memory.cpp
+ src/L0Program.cpp
+ src/L0Plugin.cpp
+ src/L0Options.cpp
+)
+list(APPEND LEVEL_ZERO_SRC_FILES
+ src/OmpWrapper.cpp
+)
+
+target_sources(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_SRC_FILES}
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_include_directories(omptarget.rtl.level_zero PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIRS}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_OMP_HEADER_DIR}
+)
+
+if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
+message(STATUS "Building Level Zero NG plugin linked against level_zero library")
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES})
+elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Full path to the L0 library is recognized as a linker option, so we
+ # separate directory and file name
+ get_filename_component(LEVEL_ZERO_LIBRARY_PATH
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} DIRECTORY)
+ get_filename_component(LEVEL_ZERO_LIBRARY_NAME
+ ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+ target_link_libraries(omptarget.rtl.level_zero PRIVATE
+ ${LEVEL_ZERO_LIBRARY_NAME} ${LIBOMP_LIB_FILE})
+ target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH})
+ target_link_options(omptarget.rtl.level_zero PRIVATE "LINKER:-def:${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.def")
+ libomptarget_add_resource_file(omptarget.rtl.level_zero)
+else()
+  message(FATAL_ERROR "Missing platform support")
+endif()
+
+else()
+message(STATUS "Building Level Zero NG plugin for dlopened level_zero")
+get_filename_component(LEVEL_ZERO_LIBRARY_NAME ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARIES} NAME)
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+ # Windows uses dll instead of lib files at runtime
+ string(REGEX REPLACE "lib$" "dll" LEVEL_ZERO_LIBRARY_NAME ${LEVEL_ZERO_LIBRARY_NAME})
+endif()
+target_compile_options(omptarget.rtl.level_zero PRIVATE "-DLEVEL_ZERO_LIBRARY=\"${LEVEL_ZERO_LIBRARY_NAME}\"")
+target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp)
+endif()
diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
new file mode 100644
index 0000000000000..105f68205e402
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h
@@ -0,0 +1,50 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Async Queue wrapper for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <vector>
+
+#include "L0Memory.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Abstract queue that supports asynchronous command submission
+struct AsyncQueueTy {
+  /// List of events attached to submitted commands
+ std::vector<ze_event_handle_t> WaitEvents;
+ /// Pending staging buffer to host copies
+ std::list<std::tuple<void *, void *, size_t>> H2MList;
+ /// Pending USM memory copy commands that must wait for kernel completion
+ std::list<std::tuple<const void *, void *, size_t>> USM2MList;
+ /// Kernel event not signaled
+ ze_event_handle_t KernelEvent = nullptr;
+ /// Is this queue being used currently
+ bool InUse = false;
+ /// Clear data
+ void reset() {
+ WaitEvents.clear();
+ H2MList.clear();
+ USM2MList.clear();
+ KernelEvent = nullptr;
+ }
+};
+
+typedef ObjPool<AsyncQueueTy> AsyncQueuePoolTy;
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h
new file mode 100644
index 0000000000000..b2b6def8101ca
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Context.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Memory.h"
+#include "PerThreadTable.h"
+
+namespace llvm::omp::target::plugin {
+
+class LevelZeroPluginTy;
+
+class L0ContextTLSTy {
+ StagingBufferTy StagingBuffer;
+
+public:
+ auto &getStagingBuffer() { return StagingBuffer; }
+ const auto &getStagingBuffer() const { return StagingBuffer; }
+
+ void clear() { StagingBuffer.clear(); }
+};
+
+struct L0ContextTLSTableTy
+ : public PerThreadContainer<
+ std::unordered_map<ze_context_handle_t, L0ContextTLSTy>> {
+ void clear() {
+ PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); });
+ }
+};
+
+/// Driver and context-specific resources. We assume a single context per
+/// driver.
+class L0ContextTy {
+ /// The plugin that created this context
+ LevelZeroPluginTy &Plugin;
+
+ /// Level Zero Driver handle
+ ze_driver_handle_t zeDriver = nullptr;
+
+ /// Common Level Zero context
+ ze_context_handle_t zeContext = nullptr;
+
+ /// API version supported by the Level Zero driver
+ ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT;
+
+ /// Imported external pointers. Track this only for user-directed
+ /// imports/releases.
+ std::unordered_map<uintptr_t, size_t> ImportedPtrs;
+
+ /// Common event pool
+ EventPoolTy EventPool;
+
+ /// Host Memory allocator for this driver
+ MemAllocatorTy HostMemAllocator;
+
+public:
+ /// Named constants for checking the imported external pointer regions.
+ static constexpr int32_t ImportNotExist = -1;
+ static constexpr int32_t ImportUnknown = 0;
+ static constexpr int32_t ImportExist = 1;
+
+ /// Create context, initialize event pool and extension functions
+ L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+ int32_t DriverId);
+
+ L0ContextTy(const L0ContextTy &) = delete;
+ L0ContextTy(L0ContextTy &&) = delete;
+ L0ContextTy &operator=(const L0ContextTy &) = delete;
+ L0ContextTy &operator=(const L0ContextTy &&) = delete;
+
+ /// Release resources
+ ~L0ContextTy() {
+ EventPool.deinit();
+ HostMemAllocator.deinit();
+ if (zeContext)
+ CALL_ZE_RET_VOID(zeContextDestroy, zeContext);
+ }
+
+ auto &getPlugin() const { return Plugin; }
+
+ StagingBufferTy &getStagingBuffer();
+
+ /// Add imported external pointer region.
+ void addImported(void *Ptr, size_t Size) {
+ (void)ImportedPtrs.emplace((uintptr_t)Ptr, Size);
+ }
+
+ /// Remove imported external pointer region
+ void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); }
+
+ /// Check if imported regions contain the specified region.
+ int32_t checkImported(void *Ptr, size_t Size) const {
+ uintptr_t LB = (uintptr_t)Ptr;
+ uintptr_t UB = LB + Size;
+ // We do not expect a large number of user-directed imports, so use simple
+ // logic.
+ for (auto &I : ImportedPtrs) {
+ uintptr_t ILB = I.first;
+ uintptr_t IUB = ILB + I.second;
+ if (LB >= ILB && UB <= IUB)
+ return ImportExist;
+ if ((LB >= ILB && LB < IUB) || (UB > ILB && UB <= IUB))
+ return ImportUnknown;
+ }
+ return ImportNotExist;
+ }
+
+ ze_driver_handle_t getZeDriver() const { return zeDriver; }
+
+ /// Return context associated with the driver
+ ze_context_handle_t getZeContext() const { return zeContext; }
+
+ /// Return driver API version
+ ze_api_version_t getDriverAPIVersion() const { return APIVersion; }
+
+ /// Return the event pool of this driver
+ auto &getEventPool() { return EventPool; }
+ const auto &getEventPool() const { return EventPool; }
+
+ bool supportsLargeMem() const {
+ // Large memory support is available since API version 1.1
+ return getDriverAPIVersion() >= ZE_API_VERSION_1_1;
+ }
+
+ const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; }
+ MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; }
+};
+
+} // namespace llvm::omp::target::plugin
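A quick worked example of the checkImported containment logic above (hypothetical addresses, purely for illustration): if a region [0x1000, 0x1100) was imported, then a query for (0x1020, 0x40) is fully contained and returns ImportExist, a query for (0x10F0, 0x40) straddles the upper bound and returns ImportUnknown, and a query for (0x2000, 0x10) does not overlap any imported region and returns ImportNotExist.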
diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h
new file mode 100644
index 0000000000000..81566f52a2aea
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h
@@ -0,0 +1,73 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// External and other auxiliary definitions
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "PluginInterface.h"
+#include "Shared/Requirements.h"
+#include "omptarget.h"
+
+#define LIBOMP_DECL(RetType, FnDecl) RetType __cdecl FnDecl
+
+enum class AllocOptionTy : int32_t {
+ ALLOC_OPT_NONE = 0,
+ ALLOC_OPT_REDUCTION_SCRATCH = 1,
+ ALLOC_OPT_REDUCTION_COUNTER = 2,
+ ALLOC_OPT_HOST_MEM = 3,
+ ALLOC_OPT_SLM = 4,
+};
+
+/// Host runtime routines being used
+extern "C" {
+LIBOMP_DECL(int, omp_get_max_teams(void));
+LIBOMP_DECL(int, omp_get_thread_limit(void));
+LIBOMP_DECL(int, omp_get_teams_thread_limit(void));
+LIBOMP_DECL(double, omp_get_wtime(void));
+} // extern "C"
+
+#ifndef EXTRACT_BITS
+// MSB=63, LSB=0
+#define EXTRACT_BITS(I64, HIGH, LOW) \
+ (((uint64_t)I64) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1)
+#endif
+
+namespace llvm::omp::target::plugin {
+
+/// Default alignment for allocation
+constexpr size_t L0Alignment = 0;
+/// Default staging buffer size for host to device copy (16KB)
+constexpr size_t L0StagingBufferSize = (1 << 14);
+/// Default staging buffer count
+constexpr size_t L0StagingBufferCount = 64;
+/// USM allocation threshold where preallocation does not pay off (128MB)
+constexpr size_t L0UsmPreAllocThreshold = (128 << 20);
+/// Host USM allocation threshold where preallocation does not pay off (8MB)
+constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20);
+
+using namespace error;
+/// Generic L0 handle type
+using ZeHandleTy = void *;
+
+template <typename... ArgsTy>
+static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+
+ if (Code == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ const char *Desc = "Unknown error";
+ return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
+ ErrFmt, Args..., Desc);
+}
+
+#define L0_UNIMPLEMENTED_ERR \
+ return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet\n", \
+ __func__);
+
+} // namespace llvm::omp::target::plugin
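For the EXTRACT_BITS macro above (MSB=63, LSB=0 convention), a small worked example: EXTRACT_BITS(0xABCD, 15, 8) shifts right by 8 to get 0xAB and masks with (1 << 8) - 1 = 0xFF, yielding 0xAB.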
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
new file mode 100644
index 0000000000000..6acfa7e0ee67d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -0,0 +1,680 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "llvm/ADT/SmallVector.h"
+
+#include "PerThreadTable.h"
+
+#include "AsyncQueue.h"
+#include "L0Context.h"
+#include "L0Program.h"
+#include "PluginInterface.h"
+#include "TLS.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+using OmpInteropTy = omp_interop_val_t *;
+class LevelZeroPluginTy;
+
+// clang-format off
+enum class PCIIdTy : int32_t {
+ None = 0x0000,
+ SKL = 0x1900,
+ KBL = 0x5900,
+ CFL = 0x3E00,
+ CFL_2 = 0x9B00,
+ ICX = 0x8A00,
+ TGL = 0xFF20,
+ TGL_2 = 0x9A00,
+ DG1 = 0x4900,
+ RKL = 0x4C00,
+ ADLS = 0x4600,
+ RTL = 0xA700,
+ MTL = 0x7D00,
+ PVC = 0x0B00,
+ DG2_ATS_M = 0x4F00,
+ DG2_ATS_M_2 = 0x5600,
+ LNL = 0x6400,
+ BMG = 0xE200,
+};
+
+/// Device type enumeration common to compiler and runtime
+enum class DeviceArchTy : uint64_t {
+ DeviceArch_None = 0,
+ DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe
+ DeviceArch_XeLPG = 0x0002,
+ DeviceArch_XeHPC = 0x0004,
+ DeviceArch_XeHPG = 0x0008,
+ DeviceArch_Xe2LP = 0x0010,
+ DeviceArch_Xe2HP = 0x0020,
+ DeviceArch_x86_64 = 0x0100
+};
+// clang-format on
+
+struct L0DeviceIdTy {
+ ze_device_handle_t zeId;
+ int32_t RootId;
+ int32_t SubId;
+ int32_t CCSId;
+
+ L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1,
+ int32_t CCSId = -1)
+ : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {}
+};
+
+class L0DeviceTLSTy {
+ /// Command list for each device
+ ze_command_list_handle_t CmdList = nullptr;
+
+ /// Main copy command list for each device
+ ze_command_list_handle_t CopyCmdList = nullptr;
+
+ /// Link copy command list for each device
+ ze_command_list_handle_t LinkCopyCmdList = nullptr;
+
+ /// Command queue for each device
+ ze_command_queue_handle_t CmdQueue = nullptr;
+
+ /// Main copy command queue for each device
+ ze_command_queue_handle_t CopyCmdQueue = nullptr;
+
+ /// Link copy command queues for each device
+ ze_command_queue_handle_t LinkCopyCmdQueue = nullptr;
+
+ /// Immediate command list for each device
+ ze_command_list_handle_t ImmCmdList = nullptr;
+
+ /// Immediate copy command list for each device
+ ze_command_list_handle_t ImmCopyCmdList = nullptr;
+
+public:
+ L0DeviceTLSTy() = default;
+ ~L0DeviceTLSTy() {
+ // assert all fields are nullptr on destruction
+ assert(CmdList == nullptr && "CmdList is not nullptr on destruction");
+ assert(CopyCmdList == nullptr &&
+ "CopyCmdList is not nullptr on destruction");
+ assert(LinkCopyCmdList == nullptr &&
+ "LinkCopyCmdList is not nullptr on destruction");
+ assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction");
+ assert(CopyCmdQueue == nullptr &&
+ "CopyCmdQueue is not nullptr on destruction");
+ assert(LinkCopyCmdQueue == nullptr &&
+ "LinkCopyCmdQueue is not nullptr on destruction");
+ assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction");
+ assert(ImmCopyCmdList == nullptr &&
+ "ImmCopyCmdList is not nullptr on destruction");
+ }
+
+ L0DeviceTLSTy(const L0DeviceTLSTy &) = delete;
+ L0DeviceTLSTy(L0DeviceTLSTy &&Other) {
+ CmdList = std::exchange(Other.CmdList, nullptr);
+ CopyCmdList = std::exchange(Other.CopyCmdList, nullptr);
+ LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr);
+ CmdQueue = std::exchange(Other.CmdQueue, nullptr);
+ CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr);
+ LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr);
+ ImmCmdList = std::exchange(Other.ImmCmdList, nullptr);
+ ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr);
+ }
+
+ void clear() {
+ // destroy all lists and queues
+ if (CmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList);
+ if (CopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList);
+ if (LinkCopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList);
+ if (ImmCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList);
+ if (ImmCopyCmdList)
+ CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList);
+ if (CmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue);
+ if (CopyCmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue);
+ if (LinkCopyCmdQueue)
+ CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue);
+
+ CmdList = nullptr;
+ CopyCmdList = nullptr;
+ LinkCopyCmdList = nullptr;
+ CmdQueue = nullptr;
+ CopyCmdQueue = nullptr;
+ LinkCopyCmdQueue = nullptr;
+ ImmCmdList = nullptr;
+ ImmCopyCmdList = nullptr;
+ }
+
+ L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete;
+ L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete;
+
+ auto getCmdList() const { return CmdList; }
+ void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; }
+
+ auto getCopyCmdList() const { return CopyCmdList; }
+ void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) {
+ CopyCmdList = _CopyCmdList;
+ }
+
+ auto getLinkCopyCmdList() const { return LinkCopyCmdList; }
+ void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) {
+ LinkCopyCmdList = _LinkCopyCmdList;
+ }
+
+ auto getImmCmdList() const { return ImmCmdList; }
+ void setImmCmdList(ze_command_list_handle_t _ImmCmdList) {
+ ImmCmdList = _ImmCmdList;
+ }
+
+ auto getImmCopyCmdList() const { return ImmCopyCmdList; }
+ void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) {
+ ImmCopyCmdList = _ImmCopyCmdList;
+ }
+
+ auto getCmdQueue() const { return CmdQueue; }
+ void setCmdQueue(ze_command_queue_handle_t _CmdQueue) {
+ CmdQueue = _CmdQueue;
+ }
+
+ auto getCopyCmdQueue() const { return CopyCmdQueue; }
+ void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) {
+ CopyCmdQueue = _CopyCmdQueue;
+ }
+
+ auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; }
+ void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) {
+ LinkCopyCmdQueue = _LinkCopyCmdQueue;
+ }
+};
+
+struct L0DeviceTLSTableTy
+ : public PerThreadContainer<std::vector<L0DeviceTLSTy>, 8> {
+ void clear() {
+ PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); });
+ }
+};
+
+class L0DeviceTy final : public GenericDeviceTy {
+ // Level Zero Context for this Device
+ L0ContextTy &l0Context;
+
+ // Level Zero handle for this Device
+ ze_device_handle_t zeDevice;
+ // Device Properties
+ ze_device_properties_t DeviceProperties{};
+ ze_device_compute_properties_t ComputeProperties{};
+ ze_device_memory_properties_t MemoryProperties{};
+ ze_device_cache_properties_t CacheProperties{};
+
+ /// Devices' default target allocation kind for internal allocation
+ int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+ DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None;
+
+ std::string DeviceName;
+
+ /// Common indirect access flags for this device
+ ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0;
+
+ /// Device UUID for toplevel devices only
+ std::string DeviceUuid;
+
+ /// L0 Device ID as string
+ std::string zeId;
+
+ /// Command queue group ordinals for each device
+ std::pair<uint32_t, uint32_t> ComputeOrdinal{UINT32_MAX, 0};
+ /// Command queue group ordinals for copying
+ std::pair<uint32_t, uint32_t> CopyOrdinal{UINT32_MAX, 0};
+ /// Command queue group ordinals and number of queues for link copy engines
+ std::pair<uint32_t, uint32_t> LinkCopyOrdinal{UINT32_MAX, 0};
+
+ /// Command queue index for each device
+ uint32_t ComputeIndex = 0;
+
+ bool IsAsyncEnabled = false;
+
+ // lock for this device
+ std::mutex Mutex;
+
+ /// Contains all modules (possibly from multiple device images) to handle
+ /// dynamic link across multiple images
+ llvm::SmallVector<ze_module_handle_t> GlobalModules;
+
+ /// L0 programs created for this device
+ std::list<L0ProgramTy> Programs;
+
+ /// MemAllocator for this device
+ MemAllocatorTy MemAllocator;
+
+ /// The current size of the global device memory pool (managed by us).
+  uint64_t HeapSize = 1L << 23L /* 8MB */;
+
+ int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
+ int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+ int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo);
+
+ bool shouldSetupDeviceMemoryPool() const override { return false; }
+ DeviceArchTy computeArch() const;
+
+ /// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findComputeOrdinal();
+
+ /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+ std::pair<uint32_t, uint32_t> findCopyOrdinal(bool LinkCopy = false);
+
+ Error internalInit();
+
+public:
+ L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
+ ze_device_handle_t zeDevice, L0ContextTy &DriverInfo,
+ const std::string &zeId, int32_t ComputeIndex)
+ : GenericDeviceTy(Plugin, DeviceId, NumDevices, {}),
+ l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId),
+ ComputeIndex(ComputeIndex) {
+ DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ DeviceProperties.pNext = nullptr;
+ ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES;
+ ComputeProperties.pNext = nullptr;
+ MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
+ MemoryProperties.pNext = nullptr;
+ CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES;
+ CacheProperties.pNext = nullptr;
+
+ auto Err = internalInit();
+ if (Err) {
+ FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n",
+ toString(std::move(Err)).c_str());
+ }
+ }
+
+ static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) {
+ return static_cast<L0DeviceTy &>(Device);
+ }
+ static L0DeviceTy &makeL0Device(ompt_device_t *Device) {
+ return *static_cast<L0DeviceTy *>(Device);
+ }
+
+ auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; }
+ L0DeviceTLSTy &getTLS();
+
+ Error setContext() override { return Plugin::success(); }
+ Error initImpl(GenericPluginTy &Plugin) override;
+ Error deinitImpl() override {
+ Programs.clear();
+ return Plugin::success();
+ }
+
+ auto getZeDevice() const { return zeDevice; }
+
+ const L0ContextTy &getL0Context() const { return l0Context; }
+ L0ContextTy &getL0Context() { return l0Context; }
+
+ const std::string &getName() const { return DeviceName; }
+ const char *getNameCStr() const { return DeviceName.c_str(); }
+
+ const std::string &getZeId() const { return zeId; }
+ const char *getZeIdCStr() const { return zeId.c_str(); }
+
+ std::mutex &getMutex() { return Mutex; }
+
+ auto getComputeIndex() const { return ComputeIndex; }
+ auto getIndirectFlags() const { return IndirectAccessFlags; }
+
+ auto getNumGlobalModules() const { return GlobalModules.size(); }
+ void addGlobalModule(ze_module_handle_t Module) {
+ GlobalModules.push_back(Module);
+ }
+ auto getGlobalModulesArray() { return GlobalModules.data(); }
+
+ L0ProgramTy *getProgramFromImage(const __tgt_device_image *Image) {
+ for (auto &PGM : Programs)
+ if (PGM.getTgtImage() == Image)
+ return &PGM;
+ return nullptr;
+ }
+
+ int32_t buildAllKernels() {
+ for (auto &PGM : Programs) {
+ int32_t RC = PGM.loadModuleKernels();
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ }
+ return OFFLOAD_SUCCESS;
+ }
+
+ // add a new program to the device. Return a reference to the new program
+ auto &addProgram(int32_t ImageId, const __tgt_device_image *Image) {
+ Programs.emplace_back(ImageId, *this, Image);
+ return Programs.back();
+ }
+
+ const auto &getLastProgram() const { return Programs.back(); }
+ auto &getLastProgram() { return Programs.back(); }
+ // Device properties getters
+ auto getVendorId() const { return DeviceProperties.vendorId; }
+ bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; }
+
+ auto getPCIId() const { return DeviceProperties.deviceId; }
+ auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; }
+ auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; }
+ auto getNumEUsPerSubslice() const {
+ return DeviceProperties.numEUsPerSubslice;
+ }
+ auto getNumSubslicesPerSlice() const {
+ return DeviceProperties.numSubslicesPerSlice;
+ }
+ auto getNumSlices() const { return DeviceProperties.numSlices; }
+ auto getNumSubslices() const {
+ return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices;
+ }
+ uint32_t getNumEUs() const {
+ return DeviceProperties.numEUsPerSubslice * getNumSubslices();
+ }
+ auto getTotalThreads() const {
+ return DeviceProperties.numThreadsPerEU * getNumEUs();
+ }
+ auto getNumThreadsPerSubslice() const {
+ return getNumEUsPerSubslice() * getNumThreadsPerEU();
+ }
+ auto getClockRate() const { return DeviceProperties.coreClockRate; }
+
+ auto getMaxSharedLocalMemory() const {
+ return ComputeProperties.maxSharedLocalMemory;
+ }
+ auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; }
+ auto getGlobalMemorySize() const { return MemoryProperties.totalSize; }
+ auto getCacheSize() const { return CacheProperties.cacheSize; }
+
+ int32_t getAllocKind() const { return AllocKind; }
+ DeviceArchTy getDeviceArch() const { return DeviceArch; }
+ bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; }
+
+ static bool isDiscrete(uint32_t PCIId) {
+ switch (static_cast<PCIIdTy>(PCIId & 0xFF00)) {
+ case PCIIdTy::BMG:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ static bool isDiscrete(ze_device_handle_t Device) {
+ ze_device_properties_t PR{};
+ PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ PR.pNext = nullptr;
+ CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR);
+ return isDiscrete(PR.deviceId);
+ }
+
+ bool isDiscreteDevice() { return isDiscrete(getPCIId()); }
+ bool isDeviceIPorNewer(uint32_t Version) const;
+
+ const std::string &getUuid() const { return DeviceUuid; }
+
+ uint32_t getComputeEngine() const { return ComputeOrdinal.first; }
+ uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; }
+
+ bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; }
+ uint32_t getMainCopyEngine() const { return CopyOrdinal.first; }
+
+ uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; }
+ uint32_t getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; }
+ bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; }
+
+ bool deviceRequiresImmCmdList() const {
+ return isDeviceIPorNewer(0x05004000);
+ }
+ bool asyncEnabled() const { return IsAsyncEnabled; }
+ bool useImmForCompute() const { return true; }
+ bool useImmForCopy() const { return true; }
+ bool useImmForInterop() const { return true; }
+ bool forceInorderInterop() const { return true; }
+
+ void reportDeviceInfo() const;
+
+ // Command queues related functions
+ /// Create a command list with given ordinal and flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ ze_command_list_flags_t Flags,
+ const std::string &DeviceIdStr);
+
+ /// Create a command list with default flags
+ ze_command_list_handle_t createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal,
+ const std::string &DeviceIdStr);
+
+ ze_command_list_handle_t getCmdList();
+
+ /// Create a command queue with given ordinal and flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ ze_command_queue_flags_t Flags,
+ const std::string &DeviceIdStr);
+
+ /// Create a command queue with default flags
+ ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device,
+ uint32_t Ordinal, uint32_t Index,
+ const std::string &DeviceIdStr,
+ bool InOrder = false);
+
+ /// Create a new command queue for the given OpenMP device ID
+ ze_command_queue_handle_t createCommandQueue(bool InOrder = false);
+
+ /// Create an immediate command list
+ ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index,
+ bool InOrder = false);
+
+ /// Create an immediate command list for computing
+ ze_command_list_handle_t createImmCmdList(bool InOrder = false) {
+ return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder);
+ }
+
+ /// Create an immediate command list for copying
+ ze_command_list_handle_t createImmCopyCmdList();
+ ze_command_queue_handle_t getCmdQueue();
+ ze_command_list_handle_t getCopyCmdList();
+ ze_command_queue_handle_t getCopyCmdQueue();
+ ze_command_list_handle_t getLinkCopyCmdList();
+ ze_command_queue_handle_t getLinkCopyCmdQueue();
+ ze_command_list_handle_t getImmCmdList();
+ ze_command_list_handle_t getImmCopyCmdList();
+
+ /// Enqueue copy command
+ int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo = nullptr,
+ bool Locked = false, bool UseCopyEngine = true);
+
+ /// Enqueue asynchronous copy command
+ int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo, bool CopyTo = true);
+
+ /// Enqueue fill command
+ int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize,
+ size_t Size);
+
+ /// Driver related functions
+
+  /// Return the driver handle for this device
+ ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); }
+
+ /// Return context for this device
+ ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); }
+
+ /// Return driver API version for this device
+ ze_api_version_t getDriverAPIVersion() const {
+ return l0Context.getDriverAPIVersion();
+ }
+
+ /// Return an event from the driver associated to this device
+ ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); }
+
+ /// Release event to the pool associated to this device
+ void releaseEvent(ze_event_handle_t Event) {
+ l0Context.getEventPool().releaseEvent(Event, *this);
+ }
+
+ StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); }
+
+ bool supportsLargeMem() const { return l0Context.supportsLargeMem(); }
+
+ // Allocation related routines
+
+ /// Data alloc
+ void *dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+ bool UserAlloc, bool DevMalloc = false,
+ uint32_t MemAdvice = UINT32_MAX,
+ AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE);
+
+ /// Data delete
+ int32_t dataDelete(void *Ptr);
+
+ /// Return the memory allocation type for the specified memory location.
+ uint32_t getMemAllocType(const void *Ptr) const;
+
+ const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; }
+ MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; }
+
+ MemAllocatorTy &getMemAllocator(int32_t Kind) {
+ if (Kind == TARGET_ALLOC_HOST)
+ return l0Context.getHostMemAllocator();
+ return getDeviceMemAllocator();
+ }
+
+ MemAllocatorTy &getMemAllocator(const void *Ptr) {
+ bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr));
+ if (IsHostMem)
+ return l0Context.getHostMemAllocator();
+ return getDeviceMemAllocator();
+ }
+
+ int32_t makeMemoryResident(void *Mem, size_t Size);
+
+ // Generic device interface implementation
+ Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+ int32_t ImageId) override;
+ Error unloadBinaryImpl(DeviceImageTy *Image) override;
+ void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+ int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override;
+
+ Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+ "dataLockImpl not supported");
+ }
+ Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
+ Expected<bool> isPinnedPtrImpl(void *, void *&, void *&,
+ size_t &) const override {
+ // Don't need to do anything, this is handled by the driver.
+ return false;
+ }
+
+ Error dataFence(__tgt_async_info *Async) override;
+ Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+ bool ReleaseQueue) override;
+ Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override;
+ Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+ void *DstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error initDeviceInfoImpl(__tgt_device_info *Info) override;
+ Expected<bool>
+ hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
+ Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData,
+ AsyncInfoWrapperTy &AsyncInfo) override{
+ L0_UNIMPLEMENTED_ERR}
+
+  /* Event routines are used to ensure ordering between data transfers.
+   * Instead of adding extra events in the queues, we make sure they're
+   * ordered by using the events from the data submission APIs, so we don't
+   * need to support these routines. They still need to report success to
+   * indicate the events are handled elsewhere; waitEvent and syncEvent should
+   * remain unimplemented.
+   */
+ Expected<bool> isEventCompleteImpl(void *EventPtr,
+ AsyncInfoWrapperTy &) override {
+ return true;
+ }
+
+ Error createEventImpl(void **EventPtrStorage) override {
+ return Plugin::success();
+ }
+ Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
+ Error recordEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::success();
+ }
+
+ Error waitEventImpl(void *EventPtr,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+ __func__);
+ }
+
+ Error syncEventImpl(void *EventPtr) override {
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+ __func__);
+ }
+
+ Expected<InfoTreeNode> obtainInfoImpl() override;
+
+ Error getDeviceStackSize(uint64_t &V) override {
+ V = 0;
+ return Plugin::success();
+ }
+ Expected<GenericKernelTy &> constructKernel(const char *Name) override;
+
+ Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); }
+ Error getDeviceHeapSize(uint64_t &V) override {
+ V = HeapSize;
+ return Plugin::success();
+ }
+ Error setDeviceHeapSize(uint64_t V) override {
+ HeapSize = V;
+ return Plugin::success();
+ }
+
+ Expected<omp_interop_val_t *>
+ createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override;
+ Error releaseInterop(omp_interop_val_t *Interop) override;
+
+ interop_spec_t selectInteropPreference(int32_t InteropType,
+ int32_t NumPrefers,
+ interop_spec_t *Prefers) override;
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h
new file mode 100644
index 0000000000000..4b8b417f9b339
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h
@@ -0,0 +1,25 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interop support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+namespace llvm::omp::target::plugin::L0Interop {
+
+/// Level Zero interop property
+struct Property {
+  // Use this when the command queue needs to be accessed, as the targetsync
+  // field in the interop object will be changed if the preferred type is SYCL.
+ ze_command_queue_handle_t CommandQueue;
+ ze_command_list_handle_t ImmCmdList;
+};
+
+} // namespace llvm::omp::target::plugin::L0Interop
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
new file mode 100644
index 0000000000000..bc6fc54cdea08
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -0,0 +1,154 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+#include "PluginInterface.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+class L0ProgramTy;
+
+/// Loop descriptor
+struct TgtLoopDescTy {
+ int64_t Lb = 0; // The lower bound of the i-th loop
+ int64_t Ub = 0; // The upper bound of the i-th loop
+ int64_t Stride = 0; // The stride of the i-th loop
+};
+
+struct TgtNDRangeDescTy {
+ int32_t NumLoops = 0; // Number of loops/dimensions
+ int32_t DistributeDim = 0; // Dimensions lower than this one
+ // must end up in one WG
+ TgtLoopDescTy Levels[3]; // Up to 3 loops
+};
+
+/// Kernel properties.
+struct KernelPropertiesTy {
+ uint32_t Width = 0;
+ uint32_t SIMDWidth = 0;
+ uint32_t MaxThreadGroupSize = 0;
+
+ /// Cached input parameters used in the previous launch
+ TgtNDRangeDescTy LoopDesc;
+ int32_t NumTeams = -1;
+ int32_t ThreadLimit = -1;
+
+ /// Cached parameters used in the previous launch
+ ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX;
+ uint32_t GroupSizes[3] = {0, 0, 0};
+ ze_group_count_t GroupCounts{0, 0, 0};
+ bool AllowCooperative = false;
+
+ std::mutex Mtx;
+
+ static constexpr TgtNDRangeDescTy LoopDescInit = {};
+
+ /// Check if we can reuse group parameters.
+ bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+ const int32_t _NumTeams, const int32_t _ThreadLimit,
+ uint32_t *_GroupSizes, ze_group_count_t &_GroupCounts,
+ bool &_AllowCooperative) const {
+ if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc)))
+ return false;
+ if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc)))
+ return false;
+ if (_NumTeams != NumTeams || _ThreadLimit != ThreadLimit)
+ return false;
+ // Found matching input parameters.
+ std::copy_n(GroupSizes, 3, _GroupSizes);
+ _GroupCounts = GroupCounts;
+ _AllowCooperative = AllowCooperative;
+ return true;
+ }
+
+ /// Update cached group parameters.
+ void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr,
+ const int32_t _NumTeams, const int32_t _ThreadLimit,
+ const uint32_t *_GroupSizes,
+ const ze_group_count_t &_GroupCounts,
+ const bool &_AllowCooperative) {
+ LoopDesc = LoopDescPtr ? *LoopDescPtr : LoopDescInit;
+ NumTeams = _NumTeams;
+ ThreadLimit = _ThreadLimit;
+ std::copy_n(_GroupSizes, 3, GroupSizes);
+ GroupCounts = _GroupCounts;
+ AllowCooperative = _AllowCooperative;
+ }
+};
+
+class L0KernelTy : public GenericKernelTy {
+ // L0 Kernel Handle
+ ze_kernel_handle_t zeKernel;
+ // Kernel Properties
+ KernelPropertiesTy Properties;
+ auto &getProperties() { return Properties; }
+
+ int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ __tgt_async_info *AsyncInfo) const;
+
+ void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams,
+ uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts,
+ bool HalfNumThreads,
+ bool IsTeamsNDRange) const;
+
+ int32_t decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const;
+
+ Error buildKernel(L0ProgramTy &Program);
+
+public:
+ /// Create a L0 kernel with a name and an execution mode.
+ L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {}
+ ~L0KernelTy() {
+ if (zeKernel)
+ CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel);
+ }
+ L0KernelTy(const L0KernelTy &) = delete;
+ L0KernelTy(L0KernelTy &&) = delete;
+ L0KernelTy &operator=(const L0KernelTy &) = delete;
+ L0KernelTy &operator=(const L0KernelTy &&) = delete;
+
+ const auto &getProperties() const { return Properties; }
+
+ /// Initialize the L0 kernel.
+ Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
+ /// Launch the L0 kernel function.
+ Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
+ uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+
+ Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+ uint64_t DynamicMemSize) const override{
+ L0_UNIMPLEMENTED_ERR}
+
+ ze_kernel_handle_t getZeKernel() const {
+ return zeKernel;
+ }
+
+ int32_t getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, void *LoopDesc,
+ bool &AllowCooperative) const;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h
new file mode 100644
index 0000000000000..50af80a19a93a
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h
@@ -0,0 +1,574 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <level_zero/ze_api.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "L0Defs.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+#define ALLOC_KIND_TO_STR(Kind) \
+ (Kind == TARGET_ALLOC_HOST \
+ ? "host memory" \
+ : (Kind == TARGET_ALLOC_SHARED \
+ ? "shared memory" \
+ : (Kind == TARGET_ALLOC_DEVICE ? "device memory" \
+ : "unknown memory")))
+
+// forward declarations
+struct L0OptionsTy;
+class L0DeviceTy;
+class L0ContextTy;
+
+struct DynamicMemHeapTy {
+ /// Base address memory is allocated from
+ uintptr_t AllocBase = 0;
+ /// Minimal size served by the current heap
+ size_t BlockSize = 0;
+ /// Max size served by the current heap
+ size_t MaxSize = 0;
+ /// Available memory blocks
+ uint32_t NumBlocks = 0;
+ /// Number of block descriptors
+ uint32_t NumBlockDesc = 0;
+ /// Number of block counters
+ uint32_t NumBlockCounter = 0;
+ /// List of memory block descriptors
+ uint64_t *BlockDesc = nullptr;
+ /// List of memory block counters
+ uint32_t *BlockCounter = nullptr;
+};
+
+struct DynamicMemPoolTy {
+ /// Location of device memory blocks
+ void *PoolBase = nullptr;
+ /// Heap size common to all heaps
+ size_t HeapSize = 0;
+ /// Number of heaps available
+ uint32_t NumHeaps = 0;
+ /// Heap descriptors (using fixed-size array to simplify memory allocation)
+ DynamicMemHeapTy HeapDesc[8];
+};
+
+/// Memory allocation information used in memory allocation/deallocation.
+struct MemAllocInfoTy {
+ /// Base address allocated from compute runtime
+ void *Base = nullptr;
+ /// Allocation size known to users/libomptarget
+ size_t Size = 0;
+ /// TARGET_ALLOC kind
+ int32_t Kind = TARGET_ALLOC_DEFAULT;
+ /// Allocation from pool?
+ bool InPool = false;
+ /// Is implicit argument
+ bool ImplicitArg = false;
+
+ MemAllocInfoTy() = default;
+
+ MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
+ bool _ImplicitArg)
+ : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
+ ImplicitArg(_ImplicitArg) {}
+};
+
+/// Responsible for all activities involving memory allocation/deallocation.
+/// It handles memory pool management and memory allocation bookkeeping.
+class MemAllocatorTy {
+
+ /// Simple memory allocation statistics. Maintains numbers for pool allocation
+ /// and GPU RT allocation.
+ struct MemStatTy {
+ size_t Requested[2] = {0, 0}; // Requested bytes
+ size_t Allocated[2] = {0, 0}; // Allocated bytes
+ size_t Freed[2] = {0, 0}; // Freed bytes
+ size_t InUse[2] = {0, 0}; // Current memory in use
+ size_t PeakUse[2] = {0, 0}; // Peak bytes used
+ size_t NumAllocs[2] = {0, 0}; // Number of allocations
+ MemStatTy() = default;
+ };
+
+ /// Memory pool which enables reuse of already allocated blocks
+ /// -- Pool maintains a list of buckets each of which can allocate fixed-size
+ /// memory.
+ /// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
+ /// -- Each memory block can allocate multiple fixed-size memory requested by
+ /// offload RT or user.
+ /// -- Memory allocation falls back to GPU RT allocation when the pool size
+ /// (total memory used by pool) reaches a threshold.
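+  /// -- As an illustrative example (with the default AllocMin of 64B), a
+  ///    100B request is rounded up to the 128B bucket; if no block in that
+  ///    bucket has a free chunk, a new block is allocated from GPU RT, split
+  ///    into 128B chunks, and one chunk is returned.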
+ class MemPoolTy {
+
+ /// Memory block maintained in each bucket
+ struct BlockTy {
+ /// Base address of this block
+ uintptr_t Base = 0;
+ /// Size of the block
+ size_t Size = 0;
+ /// Supported allocation size by this block
+ size_t ChunkSize = 0;
+ /// Total number of slots
+ uint32_t NumSlots = 0;
+ /// Number of slots in use
+ uint32_t NumUsedSlots = 0;
+ /// Cached available slot returned by the last dealloc() call
+ uint32_t FreeSlot = UINT32_MAX;
+ /// Marker for the currently used slots
+ std::vector<bool> UsedSlots;
+
+ BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
+ Base = reinterpret_cast<uintptr_t>(_Base);
+ Size = _Size;
+ ChunkSize = _ChunkSize;
+ NumSlots = Size / ChunkSize;
+ NumUsedSlots = 0;
+ UsedSlots.resize(NumSlots, false);
+ }
+
+ /// Check if the current block is fully used
+ bool isFull() const { return NumUsedSlots == NumSlots; }
+
+ /// Check if the given address belongs to the current block
+ bool contains(void *Mem) const {
+ auto M = reinterpret_cast<uintptr_t>(Mem);
+ return M >= Base && M < Base + Size;
+ }
+
+ /// Allocate a single chunk from the block
+ void *alloc();
+
+ /// Deallocate the given memory
+ void dealloc(void *Mem);
+ }; // BlockTy
+
+ /// Allocation kind for the current pool
+ int32_t AllocKind = TARGET_ALLOC_DEFAULT;
+ /// Access to the allocator
+ MemAllocatorTy *Allocator = nullptr;
+ /// Minimum supported memory allocation size from pool
+ size_t AllocMin = 1 << 6; // 64B
+ /// Maximum supported memory allocation size from pool
+ size_t AllocMax = 0;
+ /// Allocation size when the pool needs to allocate a block
+ size_t AllocUnit = 1 << 16; // 64KB
+    /// Capacity of each block in the buckets, which decides the number of
+    /// allocatable chunks in a block. Each block in a bucket can serve
+    /// at least BlockCapacity chunks.
+ /// If ChunkSize * BlockCapacity <= AllocUnit
+ /// BlockSize = AllocUnit
+ /// Otherwise,
+ /// BlockSize = ChunkSize * BlockCapacity
+ /// This simply means how much memory is over-allocated.
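+    /// For example (illustrative numbers): with AllocUnit = 64KB and
+    /// BlockCapacity = 4, a bucket of 1KB chunks uses 64KB blocks
+    /// (1KB * 4 <= 64KB, so 64 chunks per block), while a bucket of 32KB
+    /// chunks uses 128KB blocks (32KB * 4 > 64KB, so 4 chunks per block).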
+ uint32_t BlockCapacity = 0;
+ /// Total memory allocated from GPU RT for this pool
+ size_t PoolSize = 0;
+    /// Maximum allowed pool size. Allocation falls back to GPU RT allocation
+    /// when PoolSize reaches PoolSizeMax.
+ size_t PoolSizeMax = 0;
+ /// Small allocation size allowed in the pool even if pool size is over the
+ /// pool size limit
+ size_t SmallAllocMax = 1024;
+ /// Small allocation pool size
+ size_t SmallPoolSize = 0;
+ /// Small allocation pool size max (4MB)
+ size_t SmallPoolSizeMax = (4 << 20);
+ /// List of buckets
+ std::vector<std::vector<BlockTy *>> Buckets;
+ /// List of bucket parameters
+ std::vector<std::pair<size_t, size_t>> BucketParams;
+ /// Map from allocated pointer to corresponding block.
+ std::unordered_map<void *, BlockTy *> PtrToBlock;
+ /// Simple stats counting miss/hit in each bucket.
+ std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
+ /// Need to zero-initialize after L0 allocation
+ bool ZeroInit = false;
+ /// Zero-initialized values to be copied to device
+ std::vector<char> ZeroInitValue;
+
+ /// Get bucket ID from the specified allocation size.
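+    /// For example, with AllocMin = 64: sizes 1-64 map to bucket 0,
+    /// 65-128 to bucket 1, 129-256 to bucket 2, and so on.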
+ uint32_t getBucketId(size_t Size) {
+ uint32_t Count = 0;
+ for (size_t SZ = AllocMin; SZ < Size; Count++)
+ SZ <<= 1;
+ return Count;
+ }
+
+ public:
+ MemPoolTy() = default;
+
+ /// Construct pool with allocation kind, allocator, and user options.
+ MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+ const L0OptionsTy &Option);
+ // Used for reduction pool
+ MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option);
+ // Used for small memory pool with fixed parameters
+ MemPoolTy(MemAllocatorTy *_Allocator);
+
+ MemPoolTy(const MemPoolTy &) = delete;
+ MemPoolTy(MemPoolTy &&) = delete;
+ MemPoolTy &operator=(const MemPoolTy &) = delete;
+ MemPoolTy &operator=(const MemPoolTy &&) = delete;
+
+ void printUsage();
+ /// Release resources used in the pool.
+ ~MemPoolTy();
+
+ /// Allocate the requested size of memory from this pool.
+ /// AllocSize is the chunk size internally used for the returned memory.
+ void *alloc(size_t Size, size_t &AllocSize);
+    /// Deallocate the specified memory and return the deallocated block size.
+ size_t dealloc(void *Ptr);
+ }; // MemPoolTy
+
+ /// Allocation information maintained in the plugin
+ class MemAllocInfoMapTy {
+ /// Map from allocated pointer to allocation information
+ std::map<void *, MemAllocInfoTy> Map;
+ /// Map from target alloc kind to number of implicit arguments
+ std::map<int32_t, uint32_t> NumImplicitArgs;
+
+ public:
+ /// Add allocation information to the map
+ void add(void *Ptr, void *Base, size_t Size, int32_t Kind,
+ bool InPool = false, bool ImplicitArg = false);
+
+ /// Remove allocation information for the given memory location
+ bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
+
+ /// Finds allocation information for the given memory location
+ const MemAllocInfoTy *find(void *Ptr) const {
+ auto AllocInfo = Map.find(Ptr);
+ if (AllocInfo == Map.end())
+ return nullptr;
+ else
+ return &AllocInfo->second;
+ }
+
+    /// Check if the range [Ptr, Ptr + Size) lies entirely within an
+    /// allocation tracked by the map
+ bool contains(const void *Ptr, size_t Size) const {
+ if (Map.size() == 0)
+ return false;
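+      // upper_bound returns the first entry with key > Ptr; the entry just
+      // before it (if any) is the only allocation that could contain
+      // [Ptr, Ptr + Size).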
+ auto I = Map.upper_bound(const_cast<void *>(Ptr));
+ if (I == Map.begin())
+ return false;
+ --I;
+ bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr &&
+ (uintptr_t)Ptr + (uintptr_t)Size <=
+ (uintptr_t)I->first + (uintptr_t)I->second.Size;
+ return Ret;
+ }
+
+ /// Returns the number of implicit arguments for the specified allocation
+ /// kind.
+ size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+ }; // MemAllocInfoMapTy
+
+ /// L0 context to use
+ const L0ContextTy *L0Context = nullptr;
+ /// L0 device to use
+ L0DeviceTy *Device = nullptr;
+ /// Whether the device supports large memory allocation
+ bool SupportsLargeMem = false;
+ /// Cached max alloc size supported by device
+ uint64_t MaxAllocSize = INT64_MAX;
+ /// Map from allocation kind to memory statistics
+ std::unordered_map<int32_t, MemStatTy> Stats;
+ /// Map from allocation kind to memory pool
+ std::unordered_map<int32_t, MemPoolTy> Pools;
+ /// Memory pool dedicated to reduction scratch space
+ std::unique_ptr<MemPoolTy> ReductionPool;
+ /// Memory pool dedicated to reduction counters
+ std::unique_ptr<MemPoolTy> CounterPool;
+ /// Allocation information map
+ MemAllocInfoMapTy AllocInfo;
+ /// RTL-owned memory that needs to be freed automatically
+ std::list<void *> MemOwned;
+ /// Lock protection
+ std::mutex Mtx;
+ /// Allocator only supports host memory
+ bool IsHostMem = false;
+  // Internal deallocation function to be called when already
+  // holding the Mtx lock
+ int32_t dealloc_locked(void *Ptr);
+
+public:
+ MemAllocatorTy() = default;
+
+ MemAllocatorTy(const MemAllocatorTy &) = delete;
+ MemAllocatorTy(MemAllocatorTy &&) = delete;
+ MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+ MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+ /// Release resources and report statistics if requested
+ ~MemAllocatorTy() {
+ if (L0Context)
+ deinit(); // Release resources
+ }
+ void deinit();
+
+ /// Allocator only supports host memory
+ bool supportsHostMem() { return IsHostMem; }
+
+ void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+ void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+ void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+  /// Allocate memory from the L0 GPU RT. We use an over-allocation workaround
+  /// to support target pointers with offsets; a positive "ActiveSize" is
+  /// specified in such cases for correct debug logging.
+ void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0);
+
+ /// Allocate memory with the specified information from a memory pool
+ void *alloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset,
+ bool UserAlloc, bool DevMalloc, uint32_t MemAdvice,
+ AllocOptionTy AllocOpt);
+
+ /// Deallocate memory
+ int32_t dealloc(void *Ptr) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return dealloc_locked(Ptr);
+ }
+
+  /// Check if the given memory location and offset belong to any allocated
+  /// memory
+ bool contains(const void *Ptr, size_t Size) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return AllocInfo.contains(Ptr, Size);
+ }
+
+ /// Get allocation information for the specified memory location
+ const MemAllocInfoTy *getAllocInfo(void *Ptr) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ return AllocInfo.find(Ptr);
+ }
+
+ /// Get kernel indirect access flags using implicit argument info
+ ze_kernel_indirect_access_flags_t getIndirectFlags() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ ze_kernel_indirect_access_flags_t Ret = 0;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
+ if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
+ Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+ return Ret;
+ }
+
+ /// Log memory allocation/deallocation
+ void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
+ if (Stats.count(Kind) == 0)
+ return; // Stat is disabled
+
+ auto &ST = Stats[Kind];
+ int32_t I = Pool ? 1 : 0;
+ if (ReqSize > 0) {
+ ST.Requested[I] += ReqSize;
+ ST.Allocated[I] += Size;
+ ST.InUse[I] += Size;
+ ST.NumAllocs[I]++;
+ } else {
+ ST.Freed[I] += Size;
+ ST.InUse[I] -= Size;
+ }
+ ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
+ }
+
+ /// Perform copy operation
+ int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size);
+}; /// MemAllocatorTy
+
+// Simple generic wrapper to reuse objects.
+// Objects must have an accessible zero-argument constructor.
+template <class ObjTy> class ObjPool {
+ // Protection
+ std::unique_ptr<std::mutex> Mtx;
+ // List of Objects
+ std::list<ObjTy *> Objects;
+
+public:
+ ObjPool() { Mtx.reset(new std::mutex); }
+
+ ObjPool(const ObjPool &) = delete;
+  ObjPool(ObjPool &&) = delete;
+ ObjPool &operator=(const ObjPool &) = delete;
+ ObjPool &operator=(const ObjPool &&) = delete;
+
+ ObjTy *get() {
+ if (!Objects.empty()) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ if (!Objects.empty()) {
+ const auto Ret = Objects.back();
+ Objects.pop_back();
+ return Ret;
+ }
+ }
+ return new ObjTy();
+ }
+
+ void release(ObjTy *obj) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ Objects.push_back(obj);
+ }
+
+ ~ObjPool() {
+ for (auto object : Objects)
+ delete object;
+ }
+};
+
+/// Common event pool used in the plugin. This event pool assumes all events
+/// from the pool are host-visible and use the same event pool flag.
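+/// Typical usage (illustrative): an event obtained from getEvent() is passed
+/// as the signal event of an L0 command (e.g., zeCommandListAppendMemoryCopy),
+/// synchronized on by the host, and then recycled via releaseEvent().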
+class EventPoolTy {
+ /// Size of L0 event pool created on demand
+ size_t PoolSize = 64;
+
+ /// Context of the events
+ ze_context_handle_t Context = nullptr;
+
+  /// Additional event pool flags common to this pool
+ uint32_t Flags = 0;
+
+ /// Protection
+ std::unique_ptr<std::mutex> Mtx;
+
+ /// List of created L0 event pools
+ std::list<ze_event_pool_handle_t> Pools;
+
+ /// List of free L0 events
+ std::list<ze_event_handle_t> Events;
+
+#ifdef OMPT_SUPPORT
+ /// Event to OMPT record map. The timestamp information is recorded to the
+ /// OMPT record before the event is recycled.
+ std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
+#endif // OMPT_SUPPORT
+
+public:
+ /// Initialize context, flags, and mutex
+ void init(ze_context_handle_t _Context, uint32_t _Flags) {
+ Context = _Context;
+ Flags = _Flags;
+ Mtx.reset(new std::mutex);
+ }
+
+ /// Destroys L0 resources
+ void deinit() {
+ for (auto E : Events)
+ CALL_ZE_RET_VOID(zeEventDestroy, E);
+ for (auto P : Pools)
+ CALL_ZE_RET_VOID(zeEventPoolDestroy, P);
+ }
+
+ /// Get a free event from the pool
+ ze_event_handle_t getEvent();
+
+ /// Return an event to the pool
+ void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
+};
+
+/// Staging buffer
+/// A single staging buffer is not enough when batching is enabled since there
+/// can be multiple pending copy operations.
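+/// Illustrative behavior: get() always returns the first buffer (enough for
+/// synchronous copies), while getNext() hands out a distinct Size-byte slice
+/// per pending copy, growing the backing allocation by Count buffers of Size
+/// bytes when the existing slices are exhausted.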
+class StagingBufferTy {
+ /// Context for L0 calls
+ ze_context_handle_t Context = nullptr;
+ /// Max allowed size for staging buffer
+ size_t Size = L0StagingBufferSize;
+ /// Number of buffers allocated together
+ size_t Count = L0StagingBufferCount;
+  /// List of buffers, grown by Count buffers whenever a new one is required
+ std::list<void *> Buffers;
+ /// Next buffer location in the buffers
+ size_t Offset = 0;
+
+ void *addBuffers() {
+ ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+ nullptr, 0};
+ void *Ret = nullptr;
+ size_t AllocSize = Size * Count;
+ CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize,
+ L0Alignment, &Ret);
+ Buffers.push_back(Ret);
+ return Ret;
+ }
+
+public:
+ StagingBufferTy() = default;
+ StagingBufferTy(const StagingBufferTy &) = delete;
+ StagingBufferTy(StagingBufferTy &&) = delete;
+ StagingBufferTy &operator=(const StagingBufferTy &) = delete;
+ StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
+
+ ~StagingBufferTy() {
+ if (initialized())
+ clear();
+ }
+
+ void clear() {
+ ze_result_t Rc;
+    (void)Rc; // Some GCC builds think Rc is unused and warn.
+ for (auto Ptr : Buffers)
+ CALL_ZE(Rc, zeMemFree, Context, Ptr);
+ Context = nullptr;
+ }
+
+ bool initialized() const { return Context != nullptr; }
+
+ void init(ze_context_handle_t _Context, size_t _Size, size_t _Count) {
+ Context = _Context;
+ Size = _Size;
+ Count = _Count;
+ }
+
+ void reset() { Offset = 0; }
+
+ /// Always return the first buffer
+ void *get() {
+ if (Size == 0 || Count == 0)
+ return nullptr;
+ return Buffers.empty() ? addBuffers() : Buffers.front();
+ }
+
+ /// Return the next available buffer
+ void *getNext() {
+ void *Ret = nullptr;
+ if (Size == 0 || Count == 0)
+ return Ret;
+
+ size_t AllocSize = Size * Count;
+ bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
+ if (NeedToGrow)
+ Ret = addBuffers();
+ else
+ Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize));
+
+ if (!Ret)
+ return nullptr;
+
+ Offset += Size;
+ return Ret;
+ }
+
+ /// Return either a fixed buffer or next buffer
+ void *get(bool Next) { return Next ? getNext() : get(); }
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h
new file mode 100644
index 0000000000000..b3ecd25f56ddd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Options.h
@@ -0,0 +1,189 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <level_zero/ze_api.h>
+#include <map>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "Shared/EnvironmentVar.h"
+
+#include "L0Defs.h"
+
+namespace llvm::omp::target::plugin {
+/// Command submission mode
+enum class CommandModeTy { Sync = 0, Async, AsyncOrdered };
+
+/// Specialization constants used for a module compilation.
+class SpecConstantsTy {
+ std::vector<uint32_t> ConstantIds;
+ std::vector<const void *> ConstantValues;
+
+public:
+ SpecConstantsTy() = default;
+ SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
+  SpecConstantsTy &operator=(SpecConstantsTy &&) = delete;
+  SpecConstantsTy(SpecConstantsTy &&Other)
+ : ConstantIds(std::move(Other.ConstantIds)),
+ ConstantValues(std::move(Other.ConstantValues)) {}
+
+ ~SpecConstantsTy() {
+ for (auto I : ConstantValues) {
+ const char *ValuePtr = reinterpret_cast<const char *>(I);
+ delete[] ValuePtr;
+ }
+ }
+
+ template <typename T> void addConstant(uint32_t Id, T Val) {
+ const size_t ValSize = sizeof(Val);
+ char *ValuePtr = new char[ValSize];
+ *reinterpret_cast<T *>(ValuePtr) = Val;
+
+ ConstantIds.push_back(Id);
+ ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
+ }
+
+ ze_module_constants_t getModuleConstants() const {
+ ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
+ ConstantIds.data(),
+ // Unfortunately we have to const_cast it.
+ // L0 data type should probably be fixed.
+ const_cast<const void **>(ConstantValues.data())};
+ return Tmp;
+ }
+};
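+
+// Illustrative usage sketch (hypothetical IDs and values):
+//   SpecConstantsTy SC;
+//   SC.addConstant<uint32_t>(/*Id=*/1, 64u);
+//   ze_module_constants_t MC = SC.getModuleConstants();
+//   // MC can then be referenced from ze_module_desc_t::pConstants when
+//   // building a module.
+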
+#define FIXED static constexpr
+
+/// L0 Plugin flags
+struct L0OptionFlagsTy {
+ uint64_t UseMemoryPool : 1;
+ uint64_t Reserved : 63;
+ L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {}
+};
+
+struct L0OptionsTy {
+ /// Binary flags
+ L0OptionFlagsTy Flags;
+
+ /// Staging buffer size
+ size_t StagingBufferSize = L0StagingBufferSize;
+
+ /// Staging buffer count
+ size_t StagingBufferCount = L0StagingBufferCount;
+
+ // TODO: This should probably be an array indexed by AllocKind
+ /// Memory pool parameters
+ /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)}
+ std::map<int32_t, std::array<int32_t, 3>> MemPoolInfo = {
+ {TARGET_ALLOC_DEVICE, {1, 4, 256}},
+ {TARGET_ALLOC_HOST, {1, 4, 256}},
+ {TARGET_ALLOC_SHARED, {8, 4, 256}}};
+
+ /// Parameters for memory pools dedicated to reduction scratch space
+ std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
+
+ /// Oversubscription rate for normal kernels
+ FIXED uint32_t SubscriptionRate = 4;
+
+  /// Loop kernels with a known ND-range may have few iterations and
+  /// may not exploit the offload device to the fullest extent.
+ /// Let's assume a device has N total HW threads available,
+ /// and the kernel requires M hardware threads with LWS set to L.
+ /// If (M < N * ThinThreadsThreshold), then we will try
+ /// to iteratively divide L by 2 to increase the number of HW
+ /// threads used for executing the kernel. Effectively, we will
+ /// end up with L less than the kernel's SIMD width, so the HW
+ /// threads will not use all their SIMD lanes. This (presumably) should
+ /// allow more parallelism, because the stalls in the SIMD lanes
+ /// will be distributed across more HW threads, and the probability
+ /// of having a stall (or a sequence of stalls) on a critical path
+ /// in the kernel should decrease.
+  /// Anyway, this is just a heuristic that seems to work well for some
+ /// kernels (which poorly expose parallelism in the first place).
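+  /// Illustrative example (assumed numbers): with N = 8192 HW threads and a
+  /// kernel needing M = 512 HW threads at LWS L = 64, M < N * 0.1 (about 819)
+  /// holds, so L is iteratively halved (64 -> 32 -> ...) to spread the work
+  /// across more HW threads.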
+ FIXED double ThinThreadsThreshold = 0.1;
+
+ /// List of Root devices provided via option ONEAPI_DEVICE_SELECTOR
+  /// All discard filters should appear before the accept filters.
+ std::vector<std::tuple<bool, int32_t, int32_t, int32_t>> ExplicitRootDevices;
+
+ /// Is the given RootID, SubID, CcsID specified in ONEAPI_DEVICE_SELECTOR
+ bool shouldAddDevice(int32_t RootID, int32_t SubID, int32_t CCSID) const;
+
+ // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit) are used by the
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+ // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+ // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+ // builtins.
+ std::string CompilationOptions = "-cl-std=CL2.0 ";
+ std::string InternalCompilationOptions = "-cl-take-global-address";
+ std::string UserCompilationOptions = "";
+
+ // Spec constants used for all modules.
+ SpecConstantsTy CommonSpecConstants;
+
+ /// Command execution mode.
+  /// Whether the runtime uses asynchronous mode depends on the type of
+  /// device and whether immediate command lists are fully enabled.
+ CommandModeTy CommandMode = CommandModeTy::Async;
+
+ bool Init = false; // have the options already been processed
+
+ /// Read environment variables
+ L0OptionsTy() {}
+
+ void processEnvironmentVars();
+
+ void init() {
+ if (!Init) {
+ processEnvironmentVars();
+ Init = true;
+ }
+ }
+
+ /// Parse the string and split it into tokens of string_views based on the
+ /// Delim character.
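+  /// For example (illustrative), tokenize("gpu:0;gpu:1", ";") would return
+  /// {"gpu:0", "gpu:1"}.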
+ std::vector<std::string_view> tokenize(const std::string_view &Filter,
+ const std::string &Delim,
+ bool ProhibitEmptyTokens = false);
+
+ bool isDigits(const std::string_view &str) {
+ if (str.size() == 0)
+ return false;
+ return std::all_of(str.begin(), str.end(), ::isdigit);
+ }
+
+ bool match(const std::string &Var, const std::string &Matched) {
+ if (Var.size() != Matched.size())
+ return false;
+
+ auto equals = [](char a, char b) {
+ return std::tolower(a) == std::tolower(b);
+ };
+ return std::equal(Var.begin(), Var.end(), Matched.begin(), Matched.end(),
+ equals);
+ }
+
+ bool match(const std::string &Var, const char *Matched) {
+ std::string Str(Matched);
+ return match(Var, Str);
+ }
+
+ bool match(const StringEnvar &Var, const char *Matched) {
+ return match(Var.get(), Matched);
+ }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..4658c1cdab1df
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,136 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Plugin interface for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Defs.h"
+#include "L0Device.h"
+#include "L0Memory.h"
+#include "L0Options.h"
+#include "L0Program.h"
+#include "TLS.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Class implementing the Level Zero-specific functionality of the plugin.
+class LevelZeroPluginTy final : public GenericPluginTy {
+private:
+ /// Number of devices available including subdevices
+ uint32_t NumDevices = 0;
+
+ /// Context (and Driver) specific data
+ std::list<L0ContextTy> ContextList;
+
+ /// L0 device used by each OpenMP device
+ using DeviceContainerTy = llvm::SmallVector<L0DeviceTy *>;
+ DeviceContainerTy L0Devices;
+
+ // Table containing per-thread information using TLS
+ L0ThreadTblTy ThreadTLSTable;
+ // Table containing per-thread information for each device using TLS
+ L0DeviceTLSTableTy DeviceTLSTable;
+ // Table containing per-thread information for each Context using TLS
+ L0ContextTLSTableTy ContextTLSTable;
+
+ /// L0 plugin global options
+ static L0OptionsTy Options;
+
+ /// Global mutex
+ std::mutex GlobalMutex;
+
+ /// Common pool of AsyncQueue
+ AsyncQueuePoolTy AsyncQueuePool;
+
+ auto &getTLS() { return ThreadTLSTable.get(); }
+
+public:
+ LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {}
+ virtual ~LevelZeroPluginTy() {}
+
+ auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); }
+ auto &getContextTLS(ze_context_handle_t Context) {
+ return ContextTLSTable.get(Context);
+ }
+
+ static const auto &getOptions() { return Options; }
+
+ auto &getGlobalMutex() { return GlobalMutex; }
+
+ struct DevicesRangeTy {
+ using iterator = DeviceContainerTy::iterator;
+
+ iterator BeginIt;
+ iterator EndIt;
+
+ DevicesRangeTy(iterator BeginIt, iterator EndIt)
+ : BeginIt(BeginIt), EndIt(EndIt) {}
+
+ auto &begin() { return BeginIt; }
+ auto &end() { return EndIt; }
+ };
+
+ auto getDevicesRange() {
+ return DevicesRangeTy(L0Devices.begin(), L0Devices.end());
+ }
+
+ /// Clean-up routine to be invoked by the destructor or
+ /// LevelZeroPluginTy::deinit.
+ void closeRTL();
+
+ /// Find L0 devices and initialize device properties.
+ /// Returns number of devices reported to omptarget.
+ int32_t findDevices();
+
+ L0DeviceTy &getDeviceFromId(int32_t DeviceId) const {
+ assert("Invalid device ID" && DeviceId >= 0 &&
+ DeviceId < static_cast<int32_t>(L0Devices.size()));
+ return *L0Devices[DeviceId];
+ }
+
+ uint32_t getNumRootDevices() const { return NumDevices; }
+
+ AsyncQueueTy *getAsyncQueue() {
+ auto *Queue = getTLS().getAsyncQueue();
+ if (!Queue)
+ Queue = AsyncQueuePool.get();
+ return Queue;
+ }
+
+ void releaseAsyncQueue(AsyncQueueTy *Queue) {
+ if (!Queue)
+ return;
+ Queue->reset();
+ Queue->InUse = false;
+ if (!getTLS().releaseAsyncQueue(Queue))
+ AsyncQueuePool.release(Queue);
+ }
+
+ // Plugin interface
+
+ Expected<int32_t> initImpl() override;
+ Error deinitImpl() override;
+ GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+ int32_t NumDevices) override;
+ GenericGlobalHandlerTy *createGlobalHandler() override;
+ uint16_t getMagicElfBits() const override;
+ Triple::ArchType getTripleArch() const override;
+ const char *getName() const override;
+ Expected<bool> isELFCompatible(uint32_t DeviceId,
+ StringRef Image) const override;
+
+ Error flushQueueImpl(omp_interop_val_t *Interop) override;
+ Error syncBarrierImpl(omp_interop_val_t *Interop) override;
+ Error asyncBarrierImpl(omp_interop_val_t *Interop) override;
+};
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h
new file mode 100644
index 0000000000000..a548b486f4642
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Program.h
@@ -0,0 +1,135 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "L0Kernel.h"
+
+namespace llvm::omp::target::plugin {
+
+class L0DeviceTy;
+
+/// Program data to be initialized by plugin
+struct ProgramDataTy {
+ int Initialized = 0;
+ int NumDevices = 0;
+ int DeviceNum = -1;
+ uint32_t TotalEUs = 0;
+ uint32_t HWThreadsPerEU = 0;
+ uintptr_t DynamicMemoryLB = 0;
+ uintptr_t DynamicMemoryUB = 0;
+ int DeviceType = 0;
+ void *DynamicMemPool = nullptr;
+ int TeamsThreadLimit = 0;
+};
+
+/// Level Zero program that can contain multiple modules.
+class L0ProgramTy : public DeviceImageTy {
+ /// Handle multiple modules within a single target image
+ llvm::SmallVector<ze_module_handle_t> Modules;
+
+ /// Map of kernel names to Modules
+ std::unordered_map<std::string, ze_module_handle_t> KernelsToModuleMap;
+
+ /// List of kernels built for this image
+  /// We need to delete them ourselves, as the main library does not do
+  /// that yet.
+ std::list<L0KernelTy *> Kernels;
+
+ /// Module that contains global data including device RTL
+ ze_module_handle_t GlobalModule = nullptr;
+
+ /// Requires module link
+ bool RequiresModuleLink = false;
+
+  /// Is this module a library module?
+ bool IsLibModule = false;
+
+ /// Build a single module with the given image, build option, and format.
+ int32_t addModule(const size_t Size, const uint8_t *Image,
+ const std::string &BuildOption, ze_module_format_t Format);
+ /// Read file and return the size of the binary if successful.
+ size_t readFile(const char *FileName, std::vector<uint8_t> &OutFile) const;
+ int32_t readSPVFile(const char *FileName, std::vector<uint8_t> &OutSPV) const;
+ void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+ std::string &Options) const;
+
+ /// Check if the image should be handled as a library module
+ void setLibModule();
+
+ L0DeviceTy &getL0Device() const;
+
+public:
+ L0ProgramTy() = delete;
+
+ L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device,
+ const __tgt_device_image *Image)
+ : DeviceImageTy(ImageId, Device, Image) {}
+
+ ~L0ProgramTy();
+
+ L0ProgramTy(const L0ProgramTy &other) = delete;
+ L0ProgramTy(L0ProgramTy &&) = delete;
+ L0ProgramTy &operator=(const L0ProgramTy &) = delete;
+ L0ProgramTy &operator=(const L0ProgramTy &&) = delete;
+
+ static L0ProgramTy &makeL0Program(DeviceImageTy &Device) {
+ return static_cast<L0ProgramTy &>(Device);
+ }
+
+ /// Build modules from the target image description
+ int32_t buildModules(std::string &BuildOptions);
+
+ /// Link modules stored in \p Modules.
+ int32_t linkModules();
+
+  /// Load the kernel names from all modules
+ int32_t loadModuleKernels();
+
+ /// Read data from the location in the device image which corresponds to the
+ /// specified global variable name.
+ int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr);
+
+ /// Write data to the location in the device image which corresponds to the
+ /// specified global variable name.
+ int32_t writeGlobalVariable(const char *Name, size_t Size,
+ const void *HostPtr);
+
+  /// Looks up an OpenMP declare target global variable with the given
+  /// \p Name in the device environment for the current device.
+ /// The lookup is first done via the device offload table. If it fails,
+ /// then the lookup falls back to non-OpenMP specific lookup on the device.
+ void *getOffloadVarDeviceAddr(const char *Name) const;
+
+ /// Returns the handle of a module that contains a given Kernel name
+ ze_module_handle_t findModuleFromKernelName(const char *KernelName) const {
+ auto K = KernelsToModuleMap.find(std::string(KernelName));
+ if (K == KernelsToModuleMap.end())
+ return nullptr;
+
+ return K->second;
+ }
+
+ void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); }
+};
+
+struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy {
+ Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ GlobalTy &DeviceGlobal) override;
+};
+
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+ uint64_t &MinorVer);
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+ uint64_t &MinorVer);
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h
new file mode 100644
index 0000000000000..2eeae81016dee
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h
@@ -0,0 +1,193 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing L0
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+#pragma once
+
+#include "Shared/Debug.h"
+#include "omptarget.h"
+#include <string>
+#include <level_zero/ze_api.h>
+
+#define STR(x) #x
+#define TO_STRING(x) STR(x)
+
+#define DPCALL(...) \
+ do { \
+ if (getDebugLevel() > 1) \
+ DP(__VA_ARGS__); \
+ } while (0)
+
+#define FATAL_ERROR(Msg) \
+ do { \
+ fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
+ fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+#define WARNING(...) \
+ do { \
+ fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
+ fprintf(stderr, "Warning: " __VA_ARGS__); \
+ } while (0)
+
+#define INVALID_OPTION(Name, Value) \
+ WARNING("Ignoring invalid option " #Name "=%s\n", Value)
+
+#define CALL_ZE(Rc, Fn, ...) \
+ do { \
+ Rc = Fn(__VA_ARGS__); \
+ } while (0)
+
+#define CALL_ZE_RC(Rc, Fn, ...) \
+ do { \
+ CALL_ZE(Rc, Fn, __VA_ARGS__); \
+ if (Rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \
+ getZeErrorName(Rc)); \
+ } \
+  } while (0)
+
+/// For non-thread-safe functions
+#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...) \
+ do { \
+ Mtx.lock(); \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ Mtx.unlock(); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ return Ret; \
+ } \
+ } while (0)
+
+#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...) \
+ CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__)
+
+/// For thread-safe functions
+#define CALL_ZE_RET(Ret, Fn, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ return Ret; \
+ } \
+ } while (0)
+
+#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__)
+#define CALL_ZE_RET_ERROR(Fn, ...) \
+ CALL_ZE_RET( \
+ Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \
+ STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
+
+
+
+#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ const char *err_str = nullptr; \
+ rc = zeDriverGetLastErrorDescription( \
+ Dev.getDriverHandle(), &err_str); \
+ fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn, \
+ err_str); \
+ } \
+ } while (0)
+
+#define CALL_ZE_EXIT_FAIL(Fn, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE(rc, Fn, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) { \
+ DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \
+ getZeErrorName(rc)); \
+ std::exit(EXIT_FAILURE); \
+ } \
+ } while (0)
+
+#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...) \
+ do { \
+ ze_result_t rc; \
+ CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__); \
+ if (rc != ZE_RESULT_SUCCESS) \
+ return Ret; \
+ } while (0)
+
+
+#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...) \
+ CALL_ZE_EXT_SILENT_RET(Device, \
+ Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s", \
+ STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__)
+
+#define FOREACH_ZE_ERROR_CODE(Fn) \
+ Fn(ZE_RESULT_SUCCESS) \
+ Fn(ZE_RESULT_NOT_READY) \
+ Fn(ZE_RESULT_ERROR_DEVICE_LOST) \
+ Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) \
+ Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) \
+ Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) \
+ Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE) \
+ Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET) \
+ Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE) \
+ Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) \
+ Fn(ZE_RESULT_ERROR_NOT_AVAILABLE) \
+ Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE) \
+ Fn(ZE_RESULT_WARNING_DROPPED_DATA) \
+ Fn(ZE_RESULT_ERROR_UNINITIALIZED) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) \
+ Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT) \
+ Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) \
+ Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) \
+ Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER) \
+ Fn(ZE_RESULT_ERROR_INVALID_SIZE) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) \
+ Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) \
+ Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) \
+ Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) \
+ Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) \
+ Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) \
+ Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) \
+ Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) \
+ Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) \
+ Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) \
+ Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) \
+ Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) \
+ Fn(ZE_RESULT_WARNING_ACTION_REQUIRED) \
+ Fn(ZE_RESULT_ERROR_UNKNOWN)
+
+#define CASE_TO_STRING(Num) case Num: return #Num;
+inline const char *getZeErrorName(int32_t Error) {
+ switch (Error) {
+ FOREACH_ZE_ERROR_CODE(CASE_TO_STRING)
+ default:
+ return "ZE_RESULT_ERROR_UNKNOWN";
+ }
+}
diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h
new file mode 100644
index 0000000000000..8a5f41312e129
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/TLS.h
@@ -0,0 +1,86 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Thread-local storage (TLS) abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "AsyncQueue.h"
+#include "L0Memory.h"
+#include "L0Trace.h"
+#include "PerThreadTable.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// All thread-local data used by the Plugin
+class L0ThreadTLSTy {
+ /// Subdevice encoding
+ int64_t SubDeviceCode = 0;
+
+ /// Async info tracking
+ static constexpr int32_t PerThreadQueues = 10;
+ AsyncQueueTy AsyncQueues[PerThreadQueues];
+ int32_t UsedQueues = 0;
+
+public:
+ L0ThreadTLSTy() = default;
+ L0ThreadTLSTy(const L0ThreadTLSTy &) = delete;
+ L0ThreadTLSTy(L0ThreadTLSTy &&) = delete;
+ L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete;
+ L0ThreadTLSTy &operator=(const L0ThreadTLSTy &&) = delete;
+ ~L0ThreadTLSTy() {}
+
+ void clear() {}
+
+ int64_t getSubDeviceCode() { return SubDeviceCode; }
+
+ void setSubDeviceCode(int64_t Code) { SubDeviceCode = Code; }
+
+ AsyncQueueTy *getAsyncQueue() {
+ AsyncQueueTy *ret = nullptr;
+ if (UsedQueues < PerThreadQueues) {
+ // there's a free queue in this thread, find it
+ for (int32_t q = 0; q < PerThreadQueues; q++) {
+ if (!AsyncQueues[q].InUse) {
+ UsedQueues++;
+ ret = &AsyncQueues[q];
+ break;
+ }
+ }
+ assert(ret && "A queue should have been found!");
+ ret->InUse = true;
+ }
+ return ret;
+ }
+
+ bool releaseAsyncQueue(AsyncQueueTy *queue) {
+ if (queue >= &AsyncQueues[0] && queue < &AsyncQueues[PerThreadQueues]) {
+ // it's a local queue
+ queue->InUse = false;
+ UsedQueues--;
+ return true;
+ }
+ return false;
+ }
+};
+
+struct L0ThreadTblTy : public PerThread<L0ThreadTLSTy> {
+ void clear() {
+ PerThread::clear([](auto &Entry) { Entry.clear(); });
+ }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
new file mode 100644
index 0000000000000..3f50ffd2a7260
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp
@@ -0,0 +1,41 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+ int32_t /*DriverId*/)
+ : Plugin(Plugin), zeDriver(zeDriver) {
+ CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+ DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+ ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+ CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+ EventPool.init(zeContext, 0);
+ HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+ auto &TLS = Plugin.getContextTLS(getZeContext());
+ auto &Buffer = TLS.getStagingBuffer();
+ const auto &Options = Plugin.getOptions();
+ if (!Buffer.initialized())
+ Buffer.init(getZeContext(), Options.StagingBufferSize,
+ Options.StagingBufferCount);
+ return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..0029d00a07685
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1065 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+ return getPlugin().getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+static struct {
+ DeviceArchTy arch;
+ PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+ {PCIIdTy::SKL,
+ PCIIdTy::KBL,
+ PCIIdTy::CFL, PCIIdTy::CFL_2,
+ PCIIdTy::ICX,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Gen,
+ {PCIIdTy::TGL, PCIIdTy::TGL_2,
+ PCIIdTy::DG1,
+ PCIIdTy::RKL,
+ PCIIdTy::ADLS,
+ PCIIdTy::RTL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeLPG,
+ {PCIIdTy::MTL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeHPC,
+ {PCIIdTy::PVC,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_XeHPG,
+ {PCIIdTy::DG2_ATS_M,
+ PCIIdTy::DG2_ATS_M_2,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Xe2LP,
+ {PCIIdTy::LNL,
+ PCIIdTy::None}},
+ {DeviceArchTy::DeviceArch_Xe2HP,
+ {PCIIdTy::BMG,
+ PCIIdTy::None}},
+};
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+DeviceArchTy L0DeviceTy::computeArch() const {
+ const auto PCIDeviceId = getPCIId();
+ if (PCIDeviceId != 0) {
+ for (int arch = 0; arch < DeviceArchMapSize; arch++) {
+ for (int i = 0;; i++) {
+ const auto Id = DeviceArchMap[arch].ids[i];
+ if (Id == PCIIdTy::None)
+ break;
+
+ auto maskedId = static_cast<PCIIdTy>(PCIDeviceId & 0xFF00);
+ if (maskedId == Id)
+ return DeviceArchMap[arch].arch; // Exact match or prefix match
+ }
+ }
+ }
+
+ DP("Warning: Cannot decide device arch for %s.\n", getNameCStr());
+ return DeviceArchTy::DeviceArch_None;
+}
+
+bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const {
+ ze_device_ip_version_ext_t IPVersion{};
+ IPVersion.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT;
+ IPVersion.pNext = nullptr;
+ ze_device_properties_t DevicePR{};
+ DevicePR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ DevicePR.pNext = &IPVersion;
+ CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &DevicePR);
+ return IPVersion.ipVersion >= Version;
+}
+
+/// Get default compute group ordinal. Returns Ordinal-NumQueues pair
+std::pair<uint32_t, uint32_t> L0DeviceTy::findComputeOrdinal() {
+ std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+ ze_command_queue_group_properties_t Init{
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+ std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+ for (uint32_t I = 0; I < Count; I++) {
+ // TODO: add a separate set of ordinals for compute queue groups which
+ // support cooperative kernels
+ if (Properties[I].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+ Ordinal.first = I;
+ Ordinal.second = Properties[I].numQueues;
+ break;
+ }
+ }
+ if (Ordinal.first == UINT32_MAX)
+ DP("Error: no command queues are found\n");
+
+ return Ordinal;
+}
+
+/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair
+std::pair<uint32_t, uint32_t> L0DeviceTy::findCopyOrdinal(bool LinkCopy) {
+ std::pair<uint32_t, uint32_t> Ordinal{UINT32_MAX, 0};
+ uint32_t Count = 0;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ nullptr);
+ ze_command_queue_group_properties_t Init{
+ ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0};
+ std::vector<ze_command_queue_group_properties_t> Properties(Count, Init);
+ CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count,
+ Properties.data());
+
+ for (uint32_t I = 0; I < Count; I++) {
+ const auto &Flags = Properties[I].flags;
+ if ((Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+ (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) {
+ auto NumQueues = Properties[I].numQueues;
+ if (LinkCopy && NumQueues > 1) {
+ Ordinal = {I, NumQueues};
+ DP("Found link copy command queue for device " DPxMOD
+ ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n",
+ DPxPTR(zeDevice), Ordinal.first, Ordinal.second);
+ break;
+ } else if (!LinkCopy && NumQueues == 1) {
+ Ordinal = {I, NumQueues};
+ DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32
+ "\n",
+ DPxPTR(zeDevice), Ordinal.first);
+ break;
+ }
+ }
+ }
+ return Ordinal;
+}
+
+void L0DeviceTy::reportDeviceInfo() const {
+ DP("Device %" PRIu32 "\n", DeviceId);
+ DP("-- Name : %s\n", getNameCStr());
+ DP("-- PCI ID : 0x%" PRIx32 "\n", getPCIId());
+ DP("-- UUID : %s\n", getUuid().c_str());
+ DP("-- Number of total EUs : %" PRIu32 "\n", getNumEUs());
+ DP("-- Number of threads per EU : %" PRIu32 "\n", getNumThreadsPerEU());
+ DP("-- EU SIMD width : %" PRIu32 "\n", getSIMDWidth());
+ DP("-- Number of EUs per subslice : %" PRIu32 "\n", getNumEUsPerSubslice());
+ DP("-- Number of subslices per slice: %" PRIu32 "\n",
+ getNumSubslicesPerSlice());
+ DP("-- Number of slices : %" PRIu32 "\n", getNumSlices());
+ DP("-- Local memory size (bytes) : %" PRIu32 "\n",
+ getMaxSharedLocalMemory());
+ DP("-- Global memory size (bytes) : %" PRIu64 "\n", getGlobalMemorySize());
+ DP("-- Cache size (bytes) : %" PRIu64 "\n", getCacheSize());
+ DP("-- Max clock frequency (MHz) : %" PRIu32 "\n", getClockRate());
+}
+
+Error L0DeviceTy::internalInit() {
+ const auto &Options = getPlugin().getOptions();
+
+ uint32_t Count = 1;
+ const auto zeDevice = getZeDevice();
+ CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count,
+ &MemoryProperties);
+ CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count,
+ &CacheProperties);
+
+ DeviceName =
+ std::string(DeviceProperties.name, sizeof(DeviceProperties.name));
+
+ DP("Found a GPU device, Name = %s\n", DeviceProperties.name);
+
+ DeviceArch = computeArch();
+ // Default allocation kind for this device
+ AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED;
+
+ ze_kernel_indirect_access_flags_t Flags =
+ (AllocKind == TARGET_ALLOC_DEVICE)
+ ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE
+ : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
+ IndirectAccessFlags = Flags;
+
+ // Get the UUID
+ std::string uid = "";
+ for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++)
+ uid += std::to_string(DeviceProperties.uuid.id[n]);
+ DeviceUuid = std::move(uid);
+
+ ComputeOrdinal = findComputeOrdinal();
+
+ CopyOrdinal = findCopyOrdinal();
+
+ LinkCopyOrdinal = findCopyOrdinal(true);
+ IsAsyncEnabled =
+ isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync;
+ MemAllocator.initDevicePools(*this, getPlugin().getOptions());
+ l0Context.getHostMemAllocator().updateMaxAllocSize(*this);
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) {
+ return Plugin::success();
+}
+
+int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+ bool ReleaseQueue) {
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (!IsAsync)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+
+ AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue;
+
+ if (!AsyncQueue->WaitEvents.empty()) {
+ const auto &WaitEvents = AsyncQueue->WaitEvents;
+ if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
+ // Only need to wait for the last event
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX);
+ // Synchronize on kernel event to support printf()
+ auto KE = AsyncQueue->KernelEvent;
+ if (KE && KE != WaitEvents.back()) {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX);
+ }
+ for (auto &Event : WaitEvents) {
+ releaseEvent(Event);
+ }
+ } else { // Async
+ // Wait for all events. We should wait and reset events in reverse order
+ // to avoid premature event reset. If we have a kernel event in the
+ // queue, it is the last event to wait for since all wait events of the
+ // kernel are signaled before the kernel is invoked. We always invoke
+ // synchronization on kernel event to support printf().
+ bool WaitDone = false;
+ for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
+ if (!WaitDone) {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX);
+ if (*Itr == AsyncQueue->KernelEvent)
+ WaitDone = true;
+ }
+ releaseEvent(*Itr);
+ }
+ }
+ }
+
+ // Commit delayed USM2M copies
+ for (auto &USM2M : AsyncQueue->USM2MList) {
+ std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+ std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+ }
+ // Commit delayed H2M copies
+ for (auto &H2M : AsyncQueue->H2MList) {
+ std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+ static_cast<char *>(std::get<1>(H2M)));
+ }
+ if (ReleaseQueue) {
+ Plugin.releaseAsyncQueue(AsyncQueue);
+ getStagingBuffer().reset();
+ AsyncInfo->Queue = nullptr;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ if (Size == 0)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+
+ const auto DeviceId = getDeviceId();
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+ AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ const auto TgtPtrType = getMemAllocType(TgtPtr);
+ if (TgtPtrType == ZE_MEMORY_TYPE_SHARED ||
+ TgtPtrType == ZE_MEMORY_TYPE_HOST) {
+ std::copy_n(static_cast<const char *>(HstPtr), Size,
+ static_cast<char *>(TgtPtr));
+ } else {
+ const void *SrcPtr = HstPtr;
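+    // For small transfers on discrete devices, stage the data through a
+    // host-USM staging buffer first; copying from host USM to device memory
+    // is typically faster than copying directly from pageable host memory.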
+ if (isDiscreteDevice() &&
+ static_cast<size_t>(Size) <= Plugin.getOptions().StagingBufferSize &&
+ getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+ SrcPtr = getStagingBuffer().get(IsAsync);
+ std::copy_n(static_cast<const char *>(HstPtr), Size,
+ static_cast<char *>(const_cast<void *>(SrcPtr)));
+ }
+ int32_t RC;
+ if (IsAsync)
+ RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo);
+ else
+ RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo);
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr),
+ DPxPTR(TgtPtr));
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfo) {
+ if (Size == 0)
+ return OFFLOAD_SUCCESS;
+
+ auto &Plugin = getPlugin();
+ const auto DeviceId = getDeviceId();
+ bool IsAsync = AsyncInfo && asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+ AsyncInfo->Queue = Plugin.getAsyncQueue();
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ auto AsyncQueue =
+ IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+ auto TgtPtrType = getMemAllocType(TgtPtr);
+ if (TgtPtrType == ZE_MEMORY_TYPE_HOST ||
+ TgtPtrType == ZE_MEMORY_TYPE_SHARED) {
+ bool CopyNow = true;
+ if (IsAsync) {
+ if (AsyncQueue->KernelEvent) {
+ // Delay Host/Shared USM to host memory copy since it must wait for
+ // kernel completion.
+ AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size);
+ CopyNow = false;
+ }
+ }
+ if (CopyNow) {
+ std::copy_n(static_cast<const char *>(TgtPtr), Size,
+ static_cast<char *>(HstPtr));
+ }
+ } else {
+ void *DstPtr = HstPtr;
+ if (isDiscreteDevice() &&
+ static_cast<size_t>(Size) <=
+ getPlugin().getOptions().StagingBufferSize &&
+ getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
+ DstPtr = getStagingBuffer().get(IsAsync);
+ }
+ int32_t RC;
+ if (IsAsync)
+ RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo,
+ /* CopyTo */ false);
+ else
+ RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo);
+ if (RC != OFFLOAD_SUCCESS)
+ return RC;
+ if (DstPtr != HstPtr) {
+ if (IsAsync) {
+ // Store delayed H2M data copies
+ auto &H2MList = AsyncQueue->H2MList;
+ H2MList.emplace_back(DstPtr, HstPtr, static_cast<size_t>(Size));
+ } else {
+ std::copy_n(static_cast<char *>(DstPtr), Size,
+ static_cast<char *>(HstPtr));
+ }
+ }
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr),
+ DPxPTR(HstPtr));
+
+ return OFFLOAD_SUCCESS;
+}
+
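+/// Load a target image on this device: build the modules with the configured
+/// compilation options, dynamically link them, and create the kernels they
+/// define. The resulting program is cached per image.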
+Expected<DeviceImageTy *>
+L0DeviceTy::loadBinaryImpl(const __tgt_device_image *TgtImage,
+ int32_t ImageId) {
+ auto *PGM = getProgramFromImage(TgtImage);
+ if (PGM) {
+ // Program already exists
+ return PGM;
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(),
+ DPxPTR(TgtImage->ImageStart));
+
+ const size_t NumEntries =
+ (size_t)(TgtImage->EntriesEnd - TgtImage->EntriesBegin);
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Expecting to have %zu entries defined\n", NumEntries);
+ (void)NumEntries; // silence warning
+
+ const auto &Options = getPlugin().getOptions();
+ std::string CompilationOptions(Options.CompilationOptions + " " +
+ Options.UserCompilationOptions);
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
+ "Base L0 module compilation options: %s\n", CompilationOptions.c_str());
+
+ CompilationOptions += " " + Options.InternalCompilationOptions;
+ auto &Program = addProgram(ImageId, TgtImage);
+
+ int32_t RC = Program.buildModules(CompilationOptions);
+ if (RC != OFFLOAD_SUCCESS)
+ return Plugin::check(RC, "Error in buildModules %d", RC);
+
+ RC = Program.linkModules();
+ if (RC != OFFLOAD_SUCCESS)
+ return Plugin::check(RC, "Error in linkModules %d", RC);
+
+ RC = Program.loadModuleKernels();
+ if (RC != OFFLOAD_SUCCESS)
+    return Plugin::check(RC, "Error in loadModuleKernels %d", RC);
+
+ return &Program;
+}
+
+Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) {
+  // Ignored for now.
+  // TODO: properly unload the corresponding L0Program.
+ return Plugin::success();
+}
+
+Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
+ bool ReleaseQueue) {
+ if (!ReleaseQueue) {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "Support for ReleaseQueue=false in %s"
+ " not implemented yet\n",
+ __func__);
+ }
+ int32_t RC = synchronize(&AsyncInfo);
+ return Plugin::check(RC, "Error in synchronizeImpl %d", RC);
+}
+
+Expected<bool>
+L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper);
+ const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+ if (!IsAsync)
+ return false;
+
+ auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+  return !AsyncQueue->WaitEvents.empty();
+}
+
+Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) {
+ const bool IsAsync = AsyncInfo.Queue && asyncEnabled();
+ if (!IsAsync)
+ return Plugin::success();
+
+ auto &Plugin = getPlugin();
+ auto *AsyncQueue = static_cast<AsyncQueueTy *>(AsyncInfo.Queue);
+
+ if (!AsyncQueue->WaitEvents.empty())
+ return Plugin::success();
+
+ // Commit delayed USM2M copies
+ for (auto &USM2M : AsyncQueue->USM2MList) {
+ std::copy_n(static_cast<const char *>(std::get<0>(USM2M)),
+ std::get<2>(USM2M), static_cast<char *>(std::get<1>(USM2M)));
+ }
+ // Commit delayed H2M copies
+ for (auto &H2M : AsyncQueue->H2MList) {
+ std::copy_n(static_cast<char *>(std::get<0>(H2M)), std::get<2>(H2M),
+ static_cast<char *>(std::get<1>(H2M)));
+ }
+ Plugin.releaseAsyncQueue(AsyncQueue);
+ getStagingBuffer().reset();
+ AsyncInfo.Queue = nullptr;
+
+ return Plugin::success();
+}
+
+void *L0DeviceTy::allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) {
+ return dataAlloc(Size, /*Align=*/0, Kind,
+ /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr,
+ /*DevMalloc=*/false);
+}
+
+int L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) {
+ return dataDelete(TgtPtr);
+}
+
+Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
+ return Plugin::check(RC, "Error in dataSubmitImpl %d", RC);
+}
+
+Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
+ int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
+ return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC);
+}
+
+Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+ void *DstPtr, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+
+ L0DeviceTy &L0DstDev = L0DeviceTy::makeL0Device(DstDev);
+  // Use the copy engine only for cross-tile/cross-device copies.
+ const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice();
+
+ if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) {
+ if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size,
+ (__tgt_async_info *)AsyncInfoWrapper))
+ return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+ } else {
+ if (enqueueMemCopy(DstPtr, SrcPtr, Size,
+ /* AsyncInfo */ nullptr,
+ /* Locked */ false, UseCopyEngine))
+ return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed");
+ }
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ AsyncQueueTy *Queue = AsyncInfoWrapper.getQueueAs<AsyncQueueTy *>();
+ if (!Queue) {
+ Queue = getPlugin().getAsyncQueue();
+ AsyncInfoWrapper.setQueueAs<AsyncQueueTy *>(Queue);
+ }
+ return Plugin::success();
+}
+
+Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) {
+ if (!Info->Context)
+ Info->Context = getZeContext();
+ if (!Info->Device)
+ Info->Device = reinterpret_cast<void *>(getZeDevice());
+ return Plugin::success();
+}
+
+Expected<InfoTreeNode> L0DeviceTy::obtainInfoImpl() {
+ InfoTreeNode Info;
+ Info.add("Device Number", getDeviceId());
+ Info.add("Device Name", getNameCStr());
+ Info.add("Device PCI ID", getPCIId());
+ Info.add("Device UUID", getUuid().c_str());
+ Info.add("Number of total EUs", getNumEUs());
+ Info.add("Number of threads per EU", getNumThreadsPerEU());
+ Info.add("EU SIMD width", getSIMDWidth());
+ Info.add("Number of EUs per subslice", getNumEUsPerSubslice());
+ Info.add("Number of subslices per slice", getNumSubslicesPerSlice());
+ Info.add("Number of slices", getNumSlices());
+ Info.add("Local memory size (bytes)", getMaxSharedLocalMemory());
+ Info.add("Global memory size (bytes)", getGlobalMemorySize());
+ Info.add("Cache size (bytes)", getCacheSize());
+ Info.add("Max clock frequency (MHz)", getClockRate());
+ return Info;
+}
+
+Expected<GenericKernelTy &> L0DeviceTy::constructKernel(const char *Name) {
+ // Allocate and construct the L0 kernel.
+ L0KernelTy *L0Kernel = getPlugin().allocate<L0KernelTy>();
+ if (!L0Kernel)
+ return Plugin::error(ErrorCode::UNKNOWN,
+ "Failed to allocate memory for L0 kernel");
+
+ new (L0Kernel) L0KernelTy(Name);
+
+ return *L0Kernel;
+}
+
+uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const {
+  ze_memory_allocation_properties_t Properties = {
+      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
+      nullptr,                // extension
+      ZE_MEMORY_TYPE_UNKNOWN, // type
+      0,                      // id
+      0,                      // page size
+  };
+
+  ze_result_t RC;
+  CALL_ZE(RC, zeMemGetAllocProperties, getZeContext(), Ptr, &Properties,
+          nullptr);
+
+  if (RC == ZE_RESULT_ERROR_INVALID_ARGUMENT)
+    return ZE_MEMORY_TYPE_UNKNOWN;
+  return Properties.type;
+}
+
+interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType,
+ int32_t NumPrefers,
+ interop_spec_t *Prefers) {
+  // No supported preference found; default to level_zero, non-ordered.
+ return interop_spec_t{
+ tgt_fr_level_zero, {forceInorderInterop() /*inorder*/, 0}, 0};
+}
+
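+/// Create an OpenMP interop object for this device. For target/targetsync
+/// contexts the Level Zero driver, device and context handles are exposed;
+/// for targetsync an immediate command list or a command queue (depending on
+/// configuration) is created and published through async_info->Queue.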
+Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
+ interop_spec_t &InteropSpec) {
+ auto Ret =
+ new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext);
+ Ret->fr_id = tgt_fr_level_zero;
+ Ret->vendor_id = omp_vendor_intel;
+
+ if (InteropContext == kmp_interop_type_target ||
+ InteropContext == kmp_interop_type_targetsync) {
+ Ret->device_info.Platform = getZeDriver();
+ Ret->device_info.Device = getZeDevice();
+ Ret->device_info.Context = getZeContext();
+ }
+
+ Ret->rtl_property = new L0Interop::Property();
+ if (InteropContext == kmp_interop_type_targetsync) {
+ Ret->async_info = new __tgt_async_info();
+ auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
+
+ bool InOrder = InteropSpec.attrs.inorder;
+ Ret->attrs.inorder = InOrder;
+ if (useImmForInterop()) {
+ auto CmdList = createImmCmdList(InOrder);
+ Ret->async_info->Queue = CmdList;
+ L0->ImmCmdList = CmdList;
+ } else {
+ Ret->async_info->Queue = createCommandQueue(InOrder);
+ L0->CommandQueue =
+ static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
+ }
+ }
+
+ return Ret;
+}
+
+Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) {
+ const auto DeviceId = getDeviceId();
+
+ if (!Interop || Interop->device_id != (intptr_t)DeviceId) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ if (Interop->async_info && Interop->async_info->Queue) {
+ if (useImmForInterop()) {
+ auto ImmCmdList = L0->ImmCmdList;
+ CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList);
+ } else {
+ auto CmdQueue = L0->CommandQueue;
+ CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue);
+ }
+ }
+ delete L0;
+ delete Interop;
+
+ return Plugin::success();
+}
+
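+/// Blocking memory copy. With immediate command lists the copy is appended
+/// with a signal event and the host waits on that event; otherwise a regular
+/// command list is closed, executed on a command queue and synchronized
+/// before being reset for reuse.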
+int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo, bool Locked,
+ bool UseCopyEngine) {
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+ ze_event_handle_t Event = nullptr;
+
+ if (useImmForCopy()) {
+ CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList();
+ Event = getEvent();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ } else {
+ if (UseCopyEngine) {
+ CmdList = getCopyCmdList();
+ CmdQueue = getCopyCmdQueue();
+ } else {
+ CmdList = getCmdList();
+ CmdQueue = getCmdQueue();
+ }
+
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ if (Locked) {
+ CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ } else {
+ CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
+ CmdQueue, 1, &CmdList, nullptr);
+ }
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue non-blocking memory copy. This function is invoked only when IMM is
+/// fully enabled and async mode is requested.
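+/// In AsyncOrdered mode each command waits on the previously recorded event;
+/// otherwise copies only wait on an outstanding kernel event. The signal
+/// event is recorded in the queue's WaitEvents list for later synchronization.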
+int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
+ __tgt_async_info *AsyncInfo,
+ bool CopyTo) {
+ const bool Ordered =
+ (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+ ze_event_handle_t SignalEvent = getEvent();
+ size_t NumWaitEvents = 0;
+ ze_event_handle_t *WaitEvents = nullptr;
+ AsyncQueueTy *AsyncQueue = reinterpret_cast<AsyncQueueTy *>(AsyncInfo->Queue);
+ if (!AsyncQueue->WaitEvents.empty()) {
+ // Use a single wait event if events are ordered or a kernel event exists.
+ NumWaitEvents = 1;
+ if (Ordered)
+ WaitEvents = &AsyncQueue->WaitEvents.back();
+ else if (AsyncQueue->KernelEvent)
+ WaitEvents = &AsyncQueue->KernelEvent;
+ else
+ NumWaitEvents = 0;
+ }
+ auto CmdList = getImmCopyCmdList();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+ SignalEvent, NumWaitEvents, WaitEvents);
+ AsyncQueue->WaitEvents.push_back(SignalEvent);
+ return OFFLOAD_SUCCESS;
+}
+
+/// Enqueue memory fill
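+/// The Pattern of PatternSize bytes is replicated across Size bytes at Ptr.
+/// The call blocks until the fill has completed on the device.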
+int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
+ size_t PatternSize, size_t Size) {
+ if (useImmForCopy()) {
+ const auto CmdList = getImmCopyCmdList();
+ auto Event = getEvent();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+ PatternSize, Size, Event, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ } else {
+ auto CmdList = getCopyCmdList();
+ const auto CmdQueue = getCopyCmdQueue();
+ CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+ PatternSize, Size, nullptr, 0, nullptr);
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr,
+ int64_t PatternSize, int64_t Size,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ // TODO: support async version
+ // TODO: convert enqueueMemFill to return Error code
+ if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS)
+ return Plugin::success();
+
+ return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__);
+}
+
+void *L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind,
+ intptr_t Offset, bool UserAlloc, bool DevMalloc,
+ uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+
+ const bool UseDedicatedPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) ||
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+ if (Kind == TARGET_ALLOC_DEFAULT) {
+ if (UserAlloc)
+ Kind = TARGET_ALLOC_DEVICE;
+ else if (AllocOpt == AllocOptionTy::ALLOC_OPT_HOST_MEM)
+ Kind = TARGET_ALLOC_HOST;
+ else if (UseDedicatedPool)
+ Kind = TARGET_ALLOC_DEVICE;
+ else
+ Kind = getAllocKind();
+ }
+ auto &Allocator = getMemAllocator(Kind);
+ return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
+ MemAdvice, AllocOpt);
+}
+
+int32_t L0DeviceTy::dataDelete(void *Ptr) {
+ auto &Allocator = getMemAllocator(Ptr);
+ return Allocator.dealloc(Ptr);
+}
+
+int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) {
+ ze_result_t RC;
+ CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem,
+ Size);
+ if (RC != ZE_RESULT_SUCCESS) {
+ DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD
+ ".\n",
+ DPxPTR(Mem), DPxPTR(getZeDevice()));
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+// Command queues related functions
+/// Create a command list with given ordinal and flags
+ze_command_list_handle_t L0DeviceTy::createCmdList(
+ ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+ ze_command_list_flags_t Flags, const std::string &DeviceIdStr) {
+ ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+ nullptr, // extension
+ Ordinal, Flags};
+ ze_command_list_handle_t cmdList;
+ CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &cmdListDesc,
+ &cmdList);
+ DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32
+ ") for device %s.\n",
+ DPxPTR(cmdList), Ordinal, DeviceIdStr.c_str());
+ return cmdList;
+}
+
+/// Create a command list with default flags
+ze_command_list_handle_t
+L0DeviceTy::createCmdList(ze_context_handle_t Context,
+ ze_device_handle_t Device, uint32_t Ordinal,
+ const std::string &DeviceIdStr) {
+ return (Ordinal == UINT32_MAX)
+ ? nullptr
+ : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr);
+}
+
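+/// Return the calling thread's cached compute command list, creating it on
+/// first use (command lists and queues are cached per thread in TLS).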
+ze_command_list_handle_t L0DeviceTy::getCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getCmdList();
+ if (!CmdList) {
+ CmdList = createCmdList(getZeContext(), getZeDevice(), getComputeEngine(),
+ getZeId());
+ TLS.setCmdList(CmdList);
+ }
+ return CmdList;
+}
+
+/// Create a command queue with given ordinal and flags
+ze_command_queue_handle_t
+L0DeviceTy::createCmdQueue(ze_context_handle_t Context,
+ ze_device_handle_t Device, uint32_t Ordinal,
+ uint32_t Index, ze_command_queue_flags_t Flags,
+ const std::string &DeviceIdStr) {
+ ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+ nullptr, // extension
+ Ordinal,
+ Index,
+ Flags, // flags
+ ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+ ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+ ze_command_queue_handle_t cmdQueue;
+ CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &cmdQueueDesc,
+ &cmdQueue);
+ DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32
+ ", Flags: %" PRIu32 ") for device %s.\n",
+ DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.c_str());
+ return cmdQueue;
+}
+
+/// Create a command queue with default flags
+ze_command_queue_handle_t L0DeviceTy::createCmdQueue(
+ ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal,
+ uint32_t Index, const std::string &DeviceIdStr, bool InOrder) {
+ ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+ return (Ordinal == UINT32_MAX) ? nullptr
+ : createCmdQueue(Context, Device, Ordinal,
+ Index, Flags, DeviceIdStr);
+}
+
+/// Create a new command queue for the given OpenMP device ID
+ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) {
+ auto cmdQueue =
+ createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(),
+ getComputeIndex(), getZeId(), InOrder);
+ return cmdQueue;
+}
+
+/// Create an immediate command list
+ze_command_list_handle_t
+L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) {
+ ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0;
+ ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+ nullptr,
+ Ordinal,
+ Index,
+ Flags,
+ ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+ ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+ ze_command_list_handle_t CmdList = nullptr;
+ CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(),
+ &Desc, &CmdList);
+ DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32
+ ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n",
+ DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr());
+ return CmdList;
+}
+
+/// Create an immediate command list for copying
+ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() {
+ uint32_t Ordinal = getMainCopyEngine();
+ if (Ordinal == UINT32_MAX)
+ Ordinal = getLinkCopyEngine();
+ if (Ordinal == UINT32_MAX)
+ Ordinal = getComputeEngine();
+ return createImmCmdList(Ordinal, /*Index*/ 0);
+}
+
+ze_command_queue_handle_t L0DeviceTy::getCmdQueue() {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getCmdQueue();
+ if (!CmdQueue) {
+ CmdQueue = createCommandQueue();
+ TLS.setCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+}
+
+ze_command_list_handle_t L0DeviceTy::getCopyCmdList() {
+ // Use main copy engine if available
+ if (hasMainCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getCopyCmdList();
+ if (!CmdList) {
+ CmdList = createCmdList(getZeContext(), getZeDevice(),
+ getMainCopyEngine(), getZeId());
+ TLS.setCopyCmdList(CmdList);
+ }
+ return CmdList;
+ }
+ // Use link copy engine if available
+ if (hasLinkCopyEngine())
+ return getLinkCopyCmdList();
+ // Use compute engine otherwise
+ return getCmdList();
+}
+
+ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() {
+ // Use main copy engine if available
+ if (hasMainCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getCopyCmdQueue();
+ if (!CmdQueue) {
+ CmdQueue = createCmdQueue(getZeContext(), getZeDevice(),
+ getMainCopyEngine(), 0, getZeId());
+ TLS.setCopyCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+ }
+ // Use link copy engine if available
+ if (hasLinkCopyEngine())
+ return getLinkCopyCmdQueue();
+ // Use compute engine otherwise
+ return getCmdQueue();
+}
+
+ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() {
+ // Use link copy engine if available
+ if (hasLinkCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getLinkCopyCmdList();
+ if (!CmdList) {
+ CmdList =
+ createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+ ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId());
+ TLS.setLinkCopyCmdList(CmdList);
+ }
+ return CmdList;
+ }
+ // Use main copy engine if available
+ if (hasMainCopyEngine())
+ return getCopyCmdList();
+ // Use compute engine otherwise
+ return getCmdList();
+}
+
+ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() {
+ // Use link copy engine if available
+ if (hasLinkCopyEngine()) {
+ auto &TLS = getTLS();
+ auto CmdQueue = TLS.getLinkCopyCmdQueue();
+ if (!CmdQueue) {
+ // Try to use different copy engines for multiple threads
+ uint32_t Index =
+ __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues();
+ CmdQueue =
+ createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(),
+ Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId());
+ TLS.setLinkCopyCmdQueue(CmdQueue);
+ }
+ return CmdQueue;
+ }
+ // Use main copy engine if available
+ if (hasMainCopyEngine())
+ return getCopyCmdQueue();
+ // Use compute engine otherwise
+ return getCmdQueue();
+}
+
+ze_command_list_handle_t L0DeviceTy::getImmCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getImmCmdList();
+ if (!CmdList) {
+ CmdList = createImmCmdList();
+ TLS.setImmCmdList(CmdList);
+ }
+ return CmdList;
+}
+
+ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() {
+ auto &TLS = getTLS();
+ auto CmdList = TLS.getImmCopyCmdList();
+ if (!CmdList) {
+ CmdList = createImmCopyCmdList();
+ TLS.setImmCopyCmdList(CmdList);
+ }
+ return CmdList;
+}
+
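+/// Ensure previously enqueued copies are ordered before subsequent commands
+/// by appending a barrier on the copy path. This is a no-op in AsyncOrdered
+/// mode, where commands are already serialized.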
+Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
+ const bool Ordered =
+ (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+
+ // Nothing to do if everything is ordered
+ if (Ordered)
+ return Plugin::success();
+
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+
+ if (useImmForCopy()) {
+ CmdList = getImmCopyCmdList();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ } else {
+ CmdList = getCopyCmdList();
+ CmdQueue = getCopyCmdQueue();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+ CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+ }
+
+ return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
new file mode 100644
index 0000000000000..06f01f23285fc
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp
@@ -0,0 +1,134 @@
+//===--- level_zero/src/L0DynWrapper.cpp -------------------------- C++ -*-===//
+//
+// Implements wrappers for Level Zero API calls resolved through dlopen.
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/ze_api.h>
+#include <level_zero/zes_api.h>
+#include <memory>
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(zeInit, 1)
+DLWRAP(zeDriverGet, 2)
+DLWRAP(zeDeviceGet, 3)
+DLWRAP(zeDeviceGetSubDevices, 3)
+DLWRAP(zeModuleCreate, 5)
+DLWRAP(zeModuleGetProperties, 2)
+DLWRAP(zeModuleBuildLogDestroy, 1)
+DLWRAP(zeModuleBuildLogGetString, 3)
+DLWRAP(zeModuleGetKernelNames, 3)
+DLWRAP(zeModuleDestroy, 1)
+DLWRAP(zeCommandListAppendBarrier, 4)
+DLWRAP(zeCommandListAppendLaunchKernel, 6)
+DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6)
+DLWRAP(zeCommandListAppendMemoryCopy, 7)
+DLWRAP(zeCommandListAppendMemoryCopyRegion, 12)
+DLWRAP(zeCommandListAppendMemoryFill, 8)
+DLWRAP(zeCommandListAppendMemoryPrefetch, 3)
+DLWRAP(zeCommandListAppendMemAdvise, 5)
+DLWRAP(zeCommandListClose, 1)
+DLWRAP(zeCommandListCreate, 4)
+DLWRAP(zeCommandListCreateImmediate, 4)
+DLWRAP(zeCommandListDestroy, 1)
+DLWRAP(zeCommandListReset, 1)
+DLWRAP(zeCommandQueueCreate, 4)
+DLWRAP(zeCommandQueueDestroy, 1)
+DLWRAP(zeCommandQueueExecuteCommandLists, 4)
+DLWRAP(zeCommandQueueSynchronize, 2)
+DLWRAP(zeContextCreate, 3)
+DLWRAP(zeContextDestroy, 1)
+DLWRAP(zeContextMakeMemoryResident, 4)
+DLWRAP(zeDeviceCanAccessPeer, 3)
+DLWRAP(zeDeviceGetProperties, 2)
+DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3)
+DLWRAP(zeDeviceGetComputeProperties, 2)
+DLWRAP(zeDeviceGetMemoryProperties, 3)
+DLWRAP(zeDeviceGetCacheProperties, 3)
+DLWRAP(zeDeviceGetGlobalTimestamps, 3)
+DLWRAP(zeDriverGetApiVersion, 2)
+DLWRAP(zeDriverGetExtensionFunctionAddress, 3)
+DLWRAP(zeDriverGetExtensionProperties, 3)
+DLWRAP(zeEventCreate, 3)
+DLWRAP(zeEventDestroy, 1)
+DLWRAP(zeEventHostReset, 1)
+DLWRAP(zeEventHostSynchronize, 2)
+DLWRAP(zeEventPoolCreate, 5)
+DLWRAP(zeEventPoolDestroy, 1)
+DLWRAP(zeEventQueryKernelTimestamp, 2)
+DLWRAP(zeFenceCreate, 3)
+DLWRAP(zeFenceDestroy, 1)
+DLWRAP(zeFenceHostSynchronize, 2)
+DLWRAP(zeKernelCreate, 3)
+DLWRAP(zeKernelDestroy, 1)
+DLWRAP(zeKernelGetName, 3)
+DLWRAP(zeKernelGetProperties, 2)
+DLWRAP(zeKernelSetArgumentValue, 4)
+DLWRAP(zeKernelSetGroupSize, 4)
+DLWRAP(zeKernelSetIndirectAccess, 2)
+DLWRAP(zeKernelSuggestGroupSize, 7)
+DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2)
+DLWRAP(zeMemAllocDevice, 6)
+DLWRAP(zeMemAllocHost, 5)
+DLWRAP(zeMemAllocShared, 7)
+DLWRAP(zeMemFree, 2)
+DLWRAP(zeMemGetAddressRange, 4)
+DLWRAP(zeMemGetAllocProperties, 4)
+DLWRAP(zeModuleDynamicLink, 3)
+DLWRAP(zeModuleGetGlobalPointer, 4)
+DLWRAP(zesDeviceEnumMemoryModules, 3)
+DLWRAP(zesMemoryGetState, 2)
+
+DLWRAP_FINALIZE()
+
+#ifndef LEVEL_ZERO_LIBRARY
+#error "Level Zero library not defined"
+#endif
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
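+// The DLWRAP macros above generate trampolines for each Level Zero entry
+// point together with a table of symbol names and function-pointer slots.
+// loadLevelZero() dlopens the real library and fills every slot via
+// getAddressOfSymbol, so the plugin has no link-time dependency on Level Zero.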
+static bool loadLevelZero() {
+ const char *L0Library = LEVEL_ZERO_LIBRARY;
+ std::string ErrMsg;
+
+ DP("Trying to load %s\n", L0Library);
+ auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+ llvm::sys::DynamicLibrary::getPermanentLibrary(L0Library, &ErrMsg));
+ if (!DynlibHandle->isValid()) {
+ if (ErrMsg.empty())
+ ErrMsg = "unknown error";
+ DP("Unable to load library '%s': %s!\n", L0Library, ErrMsg.c_str());
+ return false;
+ }
+
+ for (size_t I = 0; I < dlwrap::size(); I++) {
+ const char *Sym = dlwrap::symbol(I);
+
+ void *P = DynlibHandle->getAddressOfSymbol(Sym);
+ if (P == nullptr) {
+ DP("Unable to find '%s' in '%s'!\n", Sym, L0Library);
+ return false;
+ }
+ DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+
+ *dlwrap::pointer(I) = P;
+ }
+
+ return true;
+}
+
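+// zeInit is the only entry point implemented eagerly: each call first ensures
+// the library is loaded and all wrapped symbols are resolved, then forwards
+// to the real zeInit through the dlwrap-generated internal pointer.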
+ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) {
+ if (!loadLevelZero())
+ return ZE_RESULT_ERROR_UNKNOWN;
+ return dlwrap_zeInit(flags);
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
new file mode 100644
index 0000000000000..d1cb0b7bd50bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -0,0 +1,649 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericKernel implementation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Kernel.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+ uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+
+ auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
+ int32_t RC = runTargetTeamRegion(l0Device, KernelArgs,
+ std::move(LaunchParams), AsyncInfoWrapper);
+ if (RC == OFFLOAD_SUCCESS)
+ return Plugin::success();
+ return Plugin::error(error::ErrorCode::UNKNOWN,
+                       "Error launching kernel %s: %d", getName(), RC);
+}
+
+Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
+ const auto *KernelName = getName();
+
+ auto Module = Program.findModuleFromKernelName(KernelName);
+ ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0,
+ KernelName};
+ CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
+ return Plugin::success();
+}
+
+Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
+ DeviceImageTy &Image) {
+ auto &Program = L0ProgramTy::makeL0Program(Image);
+
+ Error Err = buildKernel(Program);
+ if (Err)
+ return Err;
+ Program.addKernel(this);
+
+ return Plugin::success();
+}
+
+/// Read global thread limit and max teams from the host runtime. These values
+/// are subject to change at any program point, so every kernel execution
+/// needs to read the most recent values.
+static std::tuple<int32_t, int32_t> readTeamsThreadLimit() {
+ int ThrLimit;
+ ThrLimit = omp_get_teams_thread_limit();
+ DP("omp_get_teams_thread_limit() returned %" PRId32 "\n", ThrLimit);
+ // omp_get_thread_limit() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t ThreadLimit =
+ (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int32_t>::max)())
+ ? ThrLimit
+ : 0;
+
+ int NTeams = omp_get_max_teams();
+ DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+ // omp_get_max_teams() would return INT_MAX by default.
+ // NOTE: Windows.h defines max() macro, so we have to guard
+ // the call with parentheses.
+ int32_t NumTeams =
+ (NTeams > 0 && NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams
+ : 0;
+
+ return {NumTeams, ThreadLimit};
+}
+
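+/// Choose work-group sizes and counts for a kernel without an ND-range loop
+/// descriptor. User-forced num_teams/thread_limit values are honored first;
+/// otherwise the group count is derived from the device's subslice/thread
+/// topology (scaled by the subscription rate) and optionally capped using the
+/// loop trip count when one is available.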
+void L0KernelTy::decideKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit,
+ TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool IsTeamsNDRange) const {
+
+ const KernelPropertiesTy &KernelPR = getProperties();
+
+ const auto DeviceId = Device.getDeviceId();
+ bool MaxGroupSizeForced = false;
+ bool MaxGroupCountForced = false;
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+ const auto &Option = LevelZeroPluginTy::getOptions();
+ const auto OptSubscRate = Option.SubscriptionRate;
+
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
+
+ if (KernelMaxThreadGroupSize < MaxGroupSize) {
+ MaxGroupSize = KernelMaxThreadGroupSize;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Capping maximum team size to %" PRIu32
+ " due to kernel constraints.\n",
+ MaxGroupSize);
+ }
+
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t MaxGroupCount = 0;
+ if (NumTeams > 0) {
+ MaxGroupCount = NumTeams;
+ MaxGroupCountForced = true;
+ }
+
+ if (MaxGroupCountForced) {
+ // If number of teams is specified by the user, then use KernelWidth
+ // WIs per WG by default, so that it matches
+ // decideLoopKernelGroupArguments() behavior.
+ if (!MaxGroupSizeForced) {
+ MaxGroupSize = KernelWidth;
+ }
+ } else {
+ const uint32_t NumSubslices = Device.getNumSubslices();
+ uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+ if (HalfNumThreads)
+ NumThreadsPerSubslice /= 2;
+
+ MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+ if (MaxGroupSizeForced) {
+ // Set group size for the HW capacity
+ uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+ uint32_t NumGroupsPerSubslice =
+ (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ } else {
+ assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+ assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+ "Invalid maxGroupSize");
+ // Maximize group size
+ while (MaxGroupSize >= KernelWidth) {
+ uint32_t NumThreadsPerGroup =
+ (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+ if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+ uint32_t NumGroupsPerSubslice =
+ NumThreadsPerSubslice / NumThreadsPerGroup;
+ MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+ break;
+ }
+ MaxGroupSize -= KernelWidth;
+ }
+ }
+ }
+
+ uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ bool UsedReductionSubscriptionRate = false;
+ if (!MaxGroupCountForced) {
+    GRPCounts[0] *= OptSubscRate;
+
+ size_t LoopTripcount = 0;
+ if (LoopLevels) {
+ // TODO: consider other possible LoopDesc uses
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop descriptor provided but specific ND-range is disabled\n");
+ // TODO: get rid of this constraint
+ if (LoopLevels->NumLoops > 1) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+ LoopLevels->NumLoops);
+ } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+ LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+ LoopLevels->Levels[0].Stride) /
+ LoopLevels->Levels[0].Stride;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+ " = %zu\n",
+ LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+ LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+ LoopTripcount);
+ }
+ }
+
+ if (LoopTripcount && !UsedReductionSubscriptionRate) {
+ const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+ Device.getNumSubslices() * SIMDWidth;
+ size_t AdjustedGroupCount =
+ IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+ MaxTotalThreads / GRPSizes[0])
+ : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+ AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
+ AdjustedGroupCount *= OptSubscRate;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Adjusting number of teams using the loop tripcount\n");
+ if (AdjustedGroupCount < GRPCounts[0])
+ GRPCounts[0] = AdjustedGroupCount;
+ }
+ }
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the number of total HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, and the given
+// loop(s) trip counts and group sizes.
+// Returns UINT64_MAX, if computations overflow.
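+// For example, TripCounts = {4096, 1, 1}, GroupSizes = {256, 1, 1} and
+// SIMDWidth = 16 give 4096 / 256 = 16 work-groups of ceil(256 / 16) = 16 HW
+// threads each, i.e. 256 HW threads in total.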
+static uint64_t computeThreadsNeeded(const size_t (&TripCounts)[3],
+ const uint32_t (&GroupSizes)[3],
+ uint32_t SIMDWidth) {
+ uint64_t GroupCount[3];
+ for (int I = 0; I < 3; ++I) {
+ if (TripCounts[I] == 0 || GroupSizes[I] == 0)
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[I] =
+ (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I];
+ if (GroupCount[I] > (std::numeric_limits<uint32_t>::max)())
+ return (std::numeric_limits<uint64_t>::max)();
+ }
+ for (int I = 1; I < 3; ++I) {
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < GroupCount[I])
+ return (std::numeric_limits<uint64_t>::max)();
+ GroupCount[0] *= GroupCount[I];
+ }
+ // Multiplication of the group sizes must never overflow uint64_t
+ // for any existing device.
+ uint64_t LocalWorkSize =
+ uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2];
+ uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth);
+
+ // Check that the total number of threads fits uint64_t.
+ if ((std::numeric_limits<uint64_t>::max)() / GroupCount[0] < ThreadsPerWG)
+ return (std::numeric_limits<uint64_t>::max)();
+
+ return GroupCount[0] * ThreadsPerWG;
+}
+
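+/// Compute group sizes and counts for a kernel launched with an explicit
+/// ND-range loop descriptor: per-dimension trip counts are derived from the
+/// loop bounds and strides, the distribute dimension (if any) is split across
+/// work-groups, and the work-group width may be narrowed so that enough
+/// independent HW threads are available (see ThinThreadsThreshold).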
+int32_t L0KernelTy::decideLoopKernelGroupArguments(
+ L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+ uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads,
+ bool &AllowCooperative) const {
+
+ const auto DeviceId = Device.getDeviceId();
+ const auto &Options = LevelZeroPluginTy::getOptions();
+ const auto &KernelPR = getProperties();
+ uint32_t MaxGroupSize = Device.getMaxGroupSize();
+
+ bool MaxGroupSizeForced = false;
+ if (ThreadLimit > 0) {
+ MaxGroupSizeForced = true;
+ MaxGroupSize = ThreadLimit;
+ }
+
+ uint32_t GRPCounts[3] = {1, 1, 1};
+ uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+ TgtLoopDescTy *Levels = LoopLevels->Levels;
+ int32_t DistributeDim = LoopLevels->DistributeDim;
+ assert(DistributeDim >= 0 && DistributeDim <= 2 &&
+ "Invalid distribute dimension.");
+ int32_t NumLoops = LoopLevels->NumLoops;
+ assert((NumLoops > 0 && NumLoops <= 3) &&
+ "Invalid loop nest description for ND partitioning");
+
+ // Compute global widths for X/Y/Z dimensions.
+ size_t TripCounts[3] = {1, 1, 1};
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning");
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+ ", Stride = %" PRId64 "\n",
+ I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride);
+ if (Levels[I].Ub < Levels[I].Lb)
+ TripCounts[I] = 0;
+ else
+ TripCounts[I] =
+ (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride;
+ }
+
+ // Check if any of the loop has zero iterations.
+ if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) {
+ std::fill(GroupSizes, GroupSizes + 3, 1);
+ std::fill(GRPCounts, GRPCounts + 3, 1);
+ if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) {
+ // There is a distribute dimension, and the distribute loop
+ // has non-zero iterations, but some inner parallel loop
+ // has zero iterations. We still want to split the distribute
+ // loop's iterations between many WGs (of size 1), but the inner/lower
+ // dimensions should be 1x1.
+ // Note that this code is currently dead, because we are not
+ // hoisting the inner loops' bounds outside of the target regions.
+ // The code is here just for completeness.
+ size_t DistributeTripCount = TripCounts[DistributeDim];
+ if (DistributeTripCount > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n",
+ DistributeTripCount);
+ return OFFLOAD_FAIL;
+ }
+ GRPCounts[DistributeDim] = DistributeTripCount;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (!MaxGroupSizeForced) {
+ // Use zeKernelSuggestGroupSize to compute group sizes,
+ // or fallback to setting dimension 0 width to SIMDWidth.
+ // Note that in case of user-specified LWS GRPSizes[0]
+ // is already set according to the specified value.
+ size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]};
+ if (DistributeDim > 0) {
+ // There is a distribute dimension.
+ GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim];
+ GlobalSizes[DistributeDim] = 1;
+ }
+
+ {
+ if (MaxGroupSize > KernelPR.Width) {
+ GRPSizes[0] = KernelPR.Width;
+ }
+ if (DistributeDim == 0) {
+        // If there is a distribute dimension, then we do not use
+        // thin HW threads, since we do not know anything about
+        // the iteration space of the inner parallel loop regions.
+        //
+        // If there is no distribute dimension, then try to use thinner
+        // HW threads to get more independent HW threads executing
+        // the kernel - this may allow more parallelism due to
+        // the stalls being distributed across multiple HW threads rather
+        // than across SIMD lanes within one HW thread.
+ assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 &&
+ "Unexpected team sizes for dimensions 1 or/and 2.");
+ uint32_t SimdWidth = KernelPR.SIMDWidth;
+ uint64_t TotalThreads = Device.getTotalThreads();
+ TotalThreads *= Options.ThinThreadsThreshold;
+
+ uint32_t GRPSizePrev = GRPSizes[0];
+ uint64_t ThreadsNeeded =
+ computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ while (ThreadsNeeded < TotalThreads) {
+ GRPSizePrev = GRPSizes[0];
+          // Try to halve the local work size (if possible) and see
+          // how many HW threads the kernel will require with this
+          // new local work size.
+          // In most implementations the initial GRPSizes[0]
+          // will be a power of two.
+ if (GRPSizes[0] <= 1)
+ break;
+ GRPSizes[0] >>= 1;
+ ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth);
+ }
+ GRPSizes[0] = GRPSizePrev;
+ }
+ }
+ }
+
+ for (int32_t I = 0; I < NumLoops; I++) {
+ if (I < DistributeDim) {
+ GRPCounts[I] = 1;
+ continue;
+ }
+ size_t Trip = TripCounts[I];
+ if (GRPSizes[I] >= Trip)
+ GRPSizes[I] = Trip;
+ size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I];
+ if (Count > UINT32_MAX) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Invalid number of teams %zu due to large loop trip count\n", Count);
+ return OFFLOAD_FAIL;
+ }
+ GRPCounts[I] = (uint32_t)Count;
+ }
+ AllowCooperative = false;
+ GroupCounts.groupCountX = GRPCounts[0];
+ GroupCounts.groupCountY = GRPCounts[1];
+ GroupCounts.groupCountZ = GRPCounts[2];
+ std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::getGroupsShape(L0DeviceTy &SubDevice, int32_t NumTeams,
+ int32_t ThreadLimit, uint32_t *GroupSizes,
+ ze_group_count_t &GroupCounts,
+ void *LoopDesc,
+ bool &AllowCooperative) const {
+
+ const auto SubId = SubDevice.getDeviceId();
+ const auto &KernelPR = getProperties();
+
+ // Detect if we need to reduce available HW threads. We need this adjustment
+ // on XeHPG when L0 debug is enabled (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+ static std::once_flag OnceFlag;
+ static bool ZeDebugEnabled = false;
+ std::call_once(OnceFlag, []() {
+ const char *EnvVal = std::getenv("ZET_ENABLE_PROGRAM_DEBUGGING");
+ if (EnvVal && std::atoi(EnvVal) == 1)
+ ZeDebugEnabled = true;
+ });
+
+ // Read the most recent global thread limit and max teams.
+ auto [NumTeamsICV, ThreadLimitICV] = readTeamsThreadLimit();
+
+ bool IsXeHPG = SubDevice.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+ bool HalfNumThreads = ZeDebugEnabled && IsXeHPG;
+ uint32_t KernelWidth = KernelPR.Width;
+ uint32_t SIMDWidth = KernelPR.SIMDWidth;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+ "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+ assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+ if (ThreadLimit > 0) {
+    // Use the value from the thread_limit clause.
+ DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+ ThreadLimit);
+ } else if (ThreadLimitICV > 0) {
+ // else use thread-limit-var ICV
+ ThreadLimit = ThreadLimitICV;
+ DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+ }
+
+ size_t MaxThreadLimit = SubDevice.getMaxGroupSize();
+ // Set correct max group size if the kernel was compiled with explicit SIMD
+ if (SIMDWidth == 1) {
+ MaxThreadLimit = SubDevice.getNumThreadsPerSubslice();
+ }
+
+ if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+ MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+ DP("Capping maximum team size to %zu due to kernel constraints.\n",
+ MaxThreadLimit);
+ }
+
+ if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
+ ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds current maximum %zu. Adjusted\n",
+ MaxThreadLimit);
+ }
+ {
+ if (NumTeams > 0) {
+ DP("Number of teams is set to %" PRId32
+         " (num_teams clause or no teams construct)\n",
+ NumTeams);
+ } else if (NumTeamsICV > 0) {
+      // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, SubId,
+           "Using OMP_NUM_TEAMS(%" PRId32 ")\n", NumTeamsICV);
+
+ NumTeams = NumTeamsICV;
+ DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+ NumTeams);
+ }
+
+ bool UseLoopTC = LoopDesc;
+ decideKernelGroupArguments(
+ SubDevice, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
+ UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
+ GroupCounts, HalfNumThreads, false);
+ AllowCooperative = false;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+ KernelArgsTy &KernelArgs,
+ KernelLaunchParamsTy LaunchParams,
+ __tgt_async_info *AsyncInfo) const {
+ // Libomptarget can pass negative NumTeams and ThreadLimit now after
+ // introducing __tgt_target_kernel. This happens only when we have valid
+ // LoopDesc and the region is not a teams region.
+
+ auto zeKernel = getZeKernel();
+ auto DeviceId = l0Device.getDeviceId();
+ int32_t NumArgs = KernelArgs.NumArgs;
+ int32_t NumTeams = KernelArgs.NumTeams[0];
+ int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+ void *LoopDesc = nullptr;
+
+ if (NumTeams < 0)
+ NumTeams = 0;
+ if (ThreadLimit < 0)
+ ThreadLimit = 0;
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+ auto &Plugin = l0Device.getPlugin();
+ auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+ auto *IdStr = Device.getZeIdCStr();
+ auto &Options = LevelZeroPluginTy::getOptions();
+ bool IsAsync = AsyncInfo && Device.asyncEnabled();
+ if (IsAsync && !AsyncInfo->Queue) {
+ AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
+ if (!AsyncInfo->Queue)
+ IsAsync = false; // Couldn't get a queue, revert to sync
+ }
+ auto *AsyncQueue =
+      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
+
+ // We need to get a non-const version of the Properties structure in order to
+ // use its lock and be able to cache the group params and indirect flags
+ auto &KernelPR = const_cast<KernelPropertiesTy &>(getProperties());
+ // Protect from kernel preparation to submission as kernels are shared.
+ std::unique_lock<std::mutex> KernelLock(KernelPR.Mtx);
+
+ // Decide group sizes and counts
+ uint32_t GroupSizes[3];
+ ze_group_count_t GroupCounts;
+
+ bool AllowCooperative = false;
+
+ // Check if we can reuse previous group parameters
+ bool GroupParamsReused = KernelPR.reuseGroupParams(
+ static_cast<TgtNDRangeDescTy *>(LoopDesc), NumTeams, ThreadLimit,
+ GroupSizes, GroupCounts, AllowCooperative);
+
+ if (!GroupParamsReused) {
+ auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes,
+ GroupCounts, LoopDesc, AllowCooperative);
+
+ if (RC != OFFLOAD_SUCCESS) {
+ return RC;
+ }
+
+ KernelPR.cacheGroupParams(static_cast<TgtNDRangeDescTy *>(LoopDesc),
+ NumTeams, ThreadLimit, GroupSizes, GroupCounts,
+ AllowCooperative);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
+ GroupCounts.groupCountX, GroupCounts.groupCountY,
+ GroupCounts.groupCountZ);
+ for (int32_t I = 0; I < NumArgs; I++) {
+ {
+ void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
+ CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
+ Arg == nullptr ? nullptr : &Arg);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
+ ") was set successfully for device %s.\n",
+ I, DPxPTR(Arg), IdStr);
+ }
+ }
+
+ // Set Kernel Indirect flags
+ auto &PrevFlags = KernelPR.IndirectAccessFlags;
+ ze_kernel_indirect_access_flags_t Flags = 0;
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
+ Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
+
+ if (PrevFlags != Flags) {
+ // Combine with common access flags
+ const auto FinalFlags = Device.getIndirectFlags() | Flags;
+ CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags);
+ DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
+ PrevFlags = Flags;
+ }
+
+ if (!GroupParamsReused) {
+ CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0],
+ GroupSizes[1], GroupSizes[2]);
+ }
+
+ ze_command_list_handle_t CmdList = nullptr;
+ ze_command_queue_handle_t CmdQueue = nullptr;
+ const bool UseImmCmdList = Device.useImmForCompute();
+
+ if (UseImmCmdList) {
+ CmdList = Device.getImmCmdList();
+ // Command queue is not used with immediate command list
+ } else {
+ CmdList = Device.getCmdList();
+ CmdQueue = Device.getCmdQueue();
+ }
+
+ if (UseImmCmdList) {
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Using immediate command list for kernel submission.\n");
+ auto Event = Device.getEvent();
+ size_t NumWaitEvents = 0;
+ ze_event_handle_t *WaitEvents = nullptr;
+ if (IsAsync && !AsyncQueue->WaitEvents.empty()) {
+ if (Options.CommandMode == CommandModeTy::AsyncOrdered) {
+ NumWaitEvents = 1;
+ WaitEvents = &AsyncQueue->WaitEvents.back();
+ } else {
+ NumWaitEvents = AsyncQueue->WaitEvents.size();
+ WaitEvents = AsyncQueue->WaitEvents.data();
+ }
+ }
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Kernel depends on %zu data copying events.\n", NumWaitEvents);
+ if (AllowCooperative)
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList,
+ zeKernel, &GroupCounts, Event, NumWaitEvents,
+ WaitEvents);
+ else
+ CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ &GroupCounts, Event, NumWaitEvents, WaitEvents);
+ KernelLock.unlock();
+ if (IsAsync) {
+ AsyncQueue->WaitEvents.push_back(Event);
+ AsyncQueue->KernelEvent = Event;
+ } else {
+ CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX);
+ Device.releaseEvent(Event);
+ }
+ } else {
+ ze_event_handle_t Event = nullptr;
+ KernelLock.unlock();
+ CALL_ZE_RET_FAIL(zeCommandListClose, CmdList);
+ CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(),
+ CmdQueue, 1, &CmdList, nullptr);
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
+ CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ CALL_ZE_RET_FAIL(zeCommandListReset, CmdList);
+ if (Event) {
+ Device.releaseEvent(Event);
+ }
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+ IdStr);
+
+ return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
new file mode 100644
index 0000000000000..790acdd9f568f
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp
@@ -0,0 +1,637 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Memory related support for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Memory.h"
+#include "L0Device.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
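+/// Return a free chunk from this block, or nullptr if the block is full. The
+/// single-entry FreeSlot cache serves the common alloc-after-dealloc pattern
+/// without scanning the slot bitmap.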
+void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
+ if (isFull())
+ return nullptr;
+ if (FreeSlot != UINT32_MAX) {
+ const uint32_t Slot = FreeSlot;
+ FreeSlot = UINT32_MAX;
+ UsedSlots[Slot] = true;
+ NumUsedSlots++;
+ return reinterpret_cast<void *>(Base + Slot * ChunkSize);
+ }
+ for (uint32_t I = 0; I < NumSlots; I++) {
+ if (UsedSlots[I])
+ continue;
+ UsedSlots[I] = true;
+ NumUsedSlots++;
+ return reinterpret_cast<void *>(Base + I * ChunkSize);
+ }
+ // Should not reach here.
+ assert(0 && "Inconsistent memory pool state");
+ return nullptr;
+}
+
+/// Deallocate the given memory
+void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
+  assert(contains(Mem) && "Inconsistent memory pool state");
+ const uint32_t Slot = (reinterpret_cast<uintptr_t>(Mem) - Base) / ChunkSize;
+ UsedSlots[Slot] = false;
+ NumUsedSlots--;
+ FreeSlot = Slot;
+}
+
+MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator,
+ const L0OptionsTy &Option) {
+ AllocKind = Kind;
+ Allocator = _Allocator;
+
+ // Read user-defined options
+ const auto &UserOptions = Option.MemPoolInfo.at(AllocKind);
+ const size_t UserAllocMax = UserOptions[0];
+ const size_t UserCapacity = UserOptions[1];
+ const size_t UserPoolSize = UserOptions[2];
+
+ BlockCapacity = UserCapacity;
+ PoolSizeMax = UserPoolSize << 20; // MB to B
+ PoolSize = 0;
+
+ auto Context = Allocator->L0Context->getZeContext();
+ const auto Device = Allocator->Device;
+
+ // Check page size used for this allocation kind to decide minimum
+ // allocation size when allocating from L0.
+ void *Mem = Allocator->allocL0(8, 0, AllocKind);
+ ze_memory_allocation_properties_t AP{
+ ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
+ ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
+ CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
+ AllocUnit = (std::max)(AP.pageSize, AllocUnit);
+ CALL_ZE_RET_VOID(zeMemFree, Context, Mem);
+
+ bool IsDiscrete = false;
+ if (Device) {
+ ze_device_properties_t Properties{};
+ Properties.deviceId = 0;
+ Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ Properties.pNext = nullptr;
+ CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties);
+ IsDiscrete = Device->isDiscreteDevice();
+
+ if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
+ // Use page size as minimum chunk size for USM shared on discrete
+ // device.
+ // FIXME: pageSize is not returned correctly (=0) on some new devices,
+ // so use fallback value for now.
+ AllocMin = (std::max)(AP.pageSize, AllocUnit);
+ AllocUnit = AllocMin * BlockCapacity;
+ }
+ }
+
+ // Convert MB to B and round up to power of 2
+ AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
+ if (AllocMin >= AllocMax) {
+ AllocMax = 2 * AllocMin;
+ DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
+ "requirements.\n",
+ AllocMax, ALLOC_KIND_TO_STR(AllocKind));
+ }
+ assert(AllocMin < AllocMax &&
+ "Invalid parameters while initializing memory pool");
+ const auto MinSize = getBucketId(AllocMin);
+ const auto MaxSize = getBucketId(AllocMax);
+ Buckets.resize(MaxSize - MinSize + 1);
+ BucketStats.resize(Buckets.size(), {0, 0});
+
+ // Set bucket parameters
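+  // Bucket I serves chunk sizes of (AllocMin << I) bytes; for instance, with
+  // a hypothetical AllocMin of 64KB, bucket 0 holds 64KB chunks, bucket 1
+  // holds 128KB chunks, and so on, with BlockCapacity chunks per block by
+  // default.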
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const size_t ChunkSize = AllocMin << I;
+ size_t BlockSize = ChunkSize * BlockCapacity;
+    // On a discrete device, the cost of a native L0 allocation doubles when
+    // the requested size doubles beyond a certain threshold, so allocating a
+    // larger block does not pay off. It is better to keep a single chunk in a
+    // single block in such cases.
+ if (BlockSize <= AllocUnit) {
+ BlockSize = AllocUnit; // Allocation unit is already large enough
+ } else if (IsDiscrete) {
+ // Do not preallocate if it does not pay off
+ if (ChunkSize >= L0UsmPreAllocThreshold ||
+ (AllocKind == TARGET_ALLOC_HOST &&
+ ChunkSize >= L0HostUsmPreAllocThreshold))
+ BlockSize = ChunkSize;
+ }
+ BucketParams.emplace_back(ChunkSize, BlockSize);
+ }
+
+ DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
+ "AllocMax = %zu, "
+ "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
+ ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
+ BlockCapacity, PoolSizeMax);
+}
+
+// Used for reduction pool
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator,
+ const L0OptionsTy &Option) {
+ AllocKind = TARGET_ALLOC_DEVICE;
+ Allocator = _Allocator;
+ AllocMin = AllocUnit = 1024 << 6; // 64KB
+ AllocMax = Option.ReductionPoolInfo[0] << 20;
+ BlockCapacity = Option.ReductionPoolInfo[1];
+ PoolSize = 0;
+ PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20;
+
+ const auto MinSize = getBucketId(AllocMin);
+ const auto MaxSize = getBucketId(AllocMax);
+ Buckets.resize(MaxSize - MinSize + 1);
+ BucketStats.resize(Buckets.size(), {0, 0});
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const size_t ChunkSize = AllocMin << I;
+ BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
+ }
+
+ DP("Initialized reduction scratch pool for device " DPxMOD
+ ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+ DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+// Used for small memory pool with fixed parameters
+MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) {
+ AllocKind = TARGET_ALLOC_DEVICE;
+ Allocator = _Allocator;
+ AllocMax = AllocMin;
+ BlockCapacity = AllocUnit / AllocMax;
+ PoolSize = 0;
+ PoolSizeMax = (1 << 20); // this should be sufficiently large
+ Buckets.resize(1);
+ BucketStats.resize(1, {0, 0});
+ BucketParams.emplace_back(AllocMax, AllocUnit);
+ ZeroInit = true;
+ ZeroInitValue.resize(AllocUnit, 0);
+ DP("Initialized zero-initialized reduction counter pool for "
+ "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
+ DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
+}
+
+void MemAllocatorTy::MemPoolTy::printUsage() {
+ auto PrintNum = [](uint64_t Num) {
+ if (Num > 1e9)
+ fprintf(stderr, "%11.2e", float(Num));
+ else
+ fprintf(stderr, "%11" PRIu64, Num);
+ };
+
+ bool HasPoolAlloc = false;
+ for (auto &Stat : BucketStats) {
+ if (Stat.first > 0 || Stat.second > 0) {
+ HasPoolAlloc = true;
+ break;
+ }
+ }
+
+ DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind),
+ DPxPTR(Allocator->Device));
+
+ if (HasPoolAlloc) {
+ DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n",
+ AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20);
+ DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)");
+ for (size_t I = 0; I < Buckets.size(); I++) {
+ const auto &Stat = BucketStats[I];
+ if (Stat.first > 0 || Stat.second > 0) {
+ DP("-- Bucket[%10zu]:", BucketParams[I].first);
+ PrintNum(Stat.first);
+ PrintNum(Stat.second);
+ fprintf(stderr, "%11.2f\n",
+ float(Stat.second) / float(Stat.first + Stat.second) * 100);
+ }
+ }
+ } else {
+ DP("-- Not used\n");
+ }
+}
+
+/// Release resources used in the pool.
+MemAllocatorTy::MemPoolTy::~MemPoolTy() {
+ const int DebugLevel = getDebugLevel();
+ if (DebugLevel > 0)
+ printUsage();
+ for (auto &Bucket : Buckets) {
+ for (auto *Block : Bucket) {
+ if (DebugLevel > 0)
+ Allocator->log(0, Block->Size, AllocKind);
+ CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(),
+ reinterpret_cast<void *>(Block->Base));
+ delete Block;
+ }
+ }
+}
+
+/// Allocate the requested size of memory from this pool.
+/// AllocSize is the chunk size internally used for the returned memory.
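+/// Illustrative usage (a minimal sketch; `Pool` stands for any MemPoolTy
+/// instance):
+///   size_t ChunkSize = 0;
+///   void *Ptr = Pool.alloc(/*Size=*/512, ChunkSize);
+///   // On success, Ptr points to a chunk of ChunkSize (>= 512) bytes;
+///   // a zero Size or Size > AllocMax yields nullptr.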
+void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) {
+ if (Size == 0 || Size > AllocMax)
+ return nullptr;
+
+ const uint32_t BucketId = getBucketId(Size);
+ auto &Blocks = Buckets[BucketId];
+ void *Mem = nullptr;
+
+ for (auto *Block : Blocks) {
+ if (Block->isFull())
+ continue;
+ Mem = Block->alloc();
+ assert(Mem && "Inconsistent state while allocating memory from pool");
+ PtrToBlock.emplace(Mem, Block);
+ break;
+ }
+
+ if (Mem == nullptr) {
+ const bool IsSmallAllocatable =
+ (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
+ const bool IsFull = (PoolSize > PoolSizeMax);
+ if (IsFull && !IsSmallAllocatable)
+ return nullptr;
+ // Bucket is empty or all blocks in the bucket are full
+ const auto ChunkSize = BucketParams[BucketId].first;
+ const auto BlockSize = BucketParams[BucketId].second;
+ void *Base = Allocator->allocL0(BlockSize, 0, AllocKind);
+
+ if (ZeroInit) {
+ auto RC =
+ Allocator->enqueueMemCopy(Base, ZeroInitValue.data(), BlockSize);
+ if (RC != OFFLOAD_SUCCESS) {
+ DP("Failed to zero-initialize pool memory\n");
+ return nullptr;
+ }
+ }
+
+ BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
+ Blocks.push_back(Block);
+ Mem = Block->alloc();
+ PtrToBlock.emplace(Mem, Block);
+ if (IsFull)
+ SmallPoolSize += BlockSize;
+ else
+ PoolSize += BlockSize;
+ DP("New block allocation for %s pool: base = " DPxMOD
+ ", size = %zu, pool size = %zu\n",
+ ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
+ BucketStats[BucketId].first++;
+ } else {
+ BucketStats[BucketId].second++;
+ }
+
+ AllocSize = (AllocMin << BucketId);
+
+ return Mem;
+}
+
+/// Deallocate the specified memory and return the deallocated chunk size.
+size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
+ if (PtrToBlock.count(Ptr) == 0)
+ return 0;
+ PtrToBlock[Ptr]->dealloc(Ptr);
+ const size_t Deallocated = PtrToBlock[Ptr]->ChunkSize;
+ PtrToBlock.erase(Ptr);
+ return Deallocated;
+}
+
+void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size,
+ int32_t Kind, bool InPool,
+ bool ImplicitArg) {
+ const auto Inserted =
+ Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg});
+ // Check if we keep valid disjoint memory ranges.
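+ // For example (illustrative), with an existing entry covering
+ // [0x1000, 0x1100), adding a new allocation at Ptr = 0x1080 would break
+ // disjointness and trip the assertion below.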
+ [[maybe_unused]] bool Valid = Inserted.second;
+ if (Valid) {
+ if (Inserted.first != Map.begin()) {
+ const auto I = std::prev(Inserted.first, 1);
+ Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr;
+ }
+ if (Valid) {
+ const auto I = std::next(Inserted.first, 1);
+ if (I != Map.end())
+ Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first;
+ }
+ }
+ assert(Valid && "Invalid overlapping memory allocation");
+ if (ImplicitArg)
+ NumImplicitArgs[Kind]++;
+}
+
+/// Remove allocation information for the given memory location
+bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
+ MemAllocInfoTy *Removed) {
+ const auto AllocInfo = Map.find(Ptr);
+ if (AllocInfo == Map.end())
+ return false;
+ if (AllocInfo->second.ImplicitArg)
+ NumImplicitArgs[AllocInfo->second.Kind]--;
+ if (Removed)
+ *Removed = AllocInfo->second;
+ Map.erase(AllocInfo);
+ return true;
+}
+
+void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
+ const L0OptionsTy &Option) {
+ SupportsLargeMem = L0Device.supportsLargeMem();
+ IsHostMem = false;
+ Device = &L0Device;
+ L0Context = &L0Device.getL0Context();
+ for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
+ if (Option.MemPoolInfo.count(Kind) > 0) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+ std::forward_as_tuple(Kind, this, Option));
+ }
+ if (getDebugLevel() > 0)
+ Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind),
+ std::tuple<>{});
+ }
+ ReductionPool = std::make_unique<MemPoolTy>(this, Option);
+ CounterPool = std::make_unique<MemPoolTy>(this);
+ updateMaxAllocSize(L0Device);
+}
+
+void MemAllocatorTy::initHostPool(L0ContextTy &Driver,
+ const L0OptionsTy &Option) {
+ SupportsLargeMem = Driver.supportsLargeMem();
+ IsHostMem = true;
+ this->L0Context = &Driver;
+ if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ Pools.emplace(std::piecewise_construct,
+ std::forward_as_tuple(TARGET_ALLOC_HOST),
+ std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option));
+ }
+ if (getDebugLevel() > 0)
+ Stats.emplace(std::piecewise_construct,
+ std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{});
+}
+
+void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
+ // Update the maximum allocation size for this Allocator
+ ze_device_properties_t P;
+ P.maxMemAllocSize = 0;
+ P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+ P.pNext = nullptr;
+ CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &P);
+
+ if (IsHostMem) {
+ // MaxAllocSize should be the minimum of all devices from the driver
+ if (MaxAllocSize > P.maxMemAllocSize) {
+ MaxAllocSize = P.maxMemAllocSize;
+ DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
+ DPxPTR(L0Context), MaxAllocSize);
+ }
+ return;
+ }
+
+ MaxAllocSize = P.maxMemAllocSize;
+ DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
+ MaxAllocSize);
+}
+
+/// Release resources and report statistics if requested
+void MemAllocatorTy::deinit() {
+ std::lock_guard<std::mutex> Lock(Mtx);
+ // Release RTL-owned memory
+ for (auto *M : MemOwned)
+ dealloc_locked(M);
+ // Release resources used in the pool
+ Pools.clear();
+ ReductionPool.reset(nullptr);
+ CounterPool.reset(nullptr);
+ // Report memory usage if requested
+ if (getDebugLevel() > 0) {
+ for (auto &Stat : Stats) {
+ DP("Memory usage for %s, device " DPxMOD "\n",
+ ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device));
+ const auto &ST = Stat.second;
+ if (ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) {
+ DP("-- Not used\n");
+ continue;
+ }
+ DP("-- Allocator: %12s, %12s\n", "Native", "Pool");
+ DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]);
+ DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]);
+ DP("-- Freed : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]);
+ DP("-- InUse : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]);
+ DP("-- PeakUse : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]);
+ DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]);
+ }
+ }
+
+ // mark as deinitialized
+ L0Context = nullptr;
+}
+
+/// Allocate memory with the specified information
+void *MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind,
+ intptr_t Offset, bool UserAlloc, bool DevMalloc,
+ uint32_t MemAdvice, AllocOptionTy AllocOpt) {
+ assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
+ Kind == TARGET_ALLOC_SHARED) &&
+ "Unknown memory kind while allocating target memory");
+
+ std::lock_guard<std::mutex> Lock(Mtx);
+
+ // We do not expect a meaningful Align parameter when Offset > 0, so the
+ // following code does not handle such a case.
+
+ size_t AllocSize = Size + Offset;
+ void *Mem = nullptr;
+ void *AllocBase = nullptr;
+ const bool UseScratchPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH);
+ const bool UseZeroInitPool =
+ (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
+ const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;
+
+ if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) {
+ // Pool is enabled for the allocation kind, and we do not use any memory
+ // advice. We should avoid using pool if there is any meaningful memory
+ // advice not to affect sibling allocation in the same block.
+ if (Align > 0)
+ AllocSize += (Align - 1);
+ size_t PoolAllocSize = 0;
+ if (UseScratchPool)
+ AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize);
+ else if (UseZeroInitPool)
+ AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize);
+ else
+ AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize);
+ if (AllocBase) {
+ uintptr_t Base = (uintptr_t)AllocBase;
+ if (Align > 0)
+ Base = (Base + Align) & ~(Align - 1);
+ Mem = (void *)(Base + Offset);
+ AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc);
+ log(Size, PoolAllocSize, Kind, true /* Pool */);
+ if (DevMalloc)
+ MemOwned.push_back(AllocBase);
+ if (UseDedicatedPool) {
+ DP("Allocated %zu bytes from %s pool\n", Size,
+ UseScratchPool ? "scratch" : "zero-initialized");
+ }
+ return Mem;
+ }
+ }
+
+ AllocBase = allocL0(AllocSize, Align, Kind, Size);
+ if (AllocBase) {
+ Mem = (void *)((uintptr_t)AllocBase + Offset);
+ AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc);
+ if (DevMalloc)
+ MemOwned.push_back(AllocBase);
+ if (UseDedicatedPool) {
+ // We do not want this to happen in general.
+ DP("Allocated %zu bytes from L0 for %s pool\n", Size,
+ UseScratchPool ? "scratch" : "zero-initialized");
+ }
+ }
+ return Mem;
+}
+
+/// Deallocate memory
+int32_t MemAllocatorTy::dealloc_locked(void *Ptr) {
+ MemAllocInfoTy Info;
+ if (!AllocInfo.remove(Ptr, &Info)) {
+ DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
+ DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ if (Info.InPool) {
+ size_t DeallocSize = 0;
+ if (Pools.count(Info.Kind) > 0)
+ DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base);
+ if (DeallocSize == 0) {
+ // Try reduction scratch pool
+ DeallocSize = ReductionPool->dealloc(Info.Base);
+ // Try reduction counter pool
+ if (DeallocSize == 0)
+ DeallocSize = CounterPool->dealloc(Info.Base);
+ if (DeallocSize == 0) {
+ DP("Error: Cannot return memory " DPxMOD " to pool\n", DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ }
+ log(0, DeallocSize, Info.Kind, true /* Pool */);
+ return OFFLOAD_SUCCESS;
+ }
+ if (!Info.Base) {
+ DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr));
+ return OFFLOAD_FAIL;
+ }
+ CALL_ZE_RET_FAIL(zeMemFree, L0Context->getZeContext(), Info.Base);
+ log(0, Info.Size, Info.Kind);
+
+ DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
+ DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size);
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src,
+ size_t Size) {
+ return Device->enqueueMemCopy(Dst, Src, Size);
+}
+
+void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind,
+ size_t ActiveSize) {
+ void *Mem = nullptr;
+ ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+ nullptr, 0, 0};
+ ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+ nullptr, 0};
+
+ // Use the relaxed allocation limit if the driver supports it
+ ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{
+ ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr,
+ ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE};
+ if (Size > MaxAllocSize && SupportsLargeMem) {
+ DeviceDesc.pNext = &RelaxedDesc;
+ HostDesc.pNext = &RelaxedDesc;
+ }
+
+ auto zeDevice = Device ? Device->getZeDevice() : 0;
+ auto zeContext = L0Context->getZeContext();
+ bool makeResident = false;
+ switch (Kind) {
+ case TARGET_ALLOC_DEVICE:
+ makeResident = true;
+ CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align,
+ zeDevice, &Mem);
+ DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ case TARGET_ALLOC_HOST:
+ CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem);
+ DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ case TARGET_ALLOC_SHARED:
+ CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size,
+ Align, zeDevice, &Mem);
+ DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem));
+ break;
+ default:
+ assert(0 && "Invalid target data allocation kind");
+ }
+
+ size_t LoggedSize = ActiveSize ? ActiveSize : Size;
+ log(LoggedSize, LoggedSize, Kind);
+ if (makeResident) {
+ assert(Device &&
+ "Device is not set for memory allocation. Is this a Device Pool?");
+ if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS)
+ Mem = nullptr;
+ }
+ return Mem;
+}
+
+ze_event_handle_t EventPoolTy::getEvent() {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+
+ if (Events.empty()) {
+ // Need to create a new L0 pool
+ ze_event_pool_desc_t Desc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0, 0};
+ Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
+ Desc.count = PoolSize;
+ ze_event_pool_handle_t Pool;
+ CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool);
+ Pools.push_back(Pool);
+
+ // Create events
+ ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
+ EventDesc.wait = 0;
+ EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+ for (uint32_t I = 0; I < PoolSize; I++) {
+ EventDesc.index = I;
+ ze_event_handle_t Event;
+ CALL_ZE_RET_NULL(zeEventCreate, Pool, &EventDesc, &Event);
+ Events.push_back(Event);
+ }
+ }
+
+ auto Ret = Events.back();
+ Events.pop_back();
+
+ return Ret;
+}
+
+/// Return an event to the pool
+void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
+ std::lock_guard<std::mutex> Lock(*Mtx);
+ CALL_ZE_RET_VOID(zeEventHostReset, Event);
+ Events.push_back(Event);
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
new file mode 100644
index 0000000000000..3acb2e78927e7
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp
@@ -0,0 +1,371 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero RTL Options support
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget.h"
+
+#include "L0Defs.h"
+#include "L0Options.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+/// Check whether the given RootID, SubID, CCSID is selected by ONEAPI_DEVICE_SELECTOR.
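+/// For example (illustrative), ONEAPI_DEVICE_SELECTOR="level_zero:0,1.*"
+/// accepts root device 0 and every sub-device of root device 1.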
+bool L0OptionsTy::shouldAddDevice(int32_t RootID, int32_t SubID,
+ int32_t CCSID) const {
+ if (ExplicitRootDevices.empty())
+ return false;
+ for (const auto &RootDev : ExplicitRootDevices) {
+ const auto ErootID = std::get<1>(RootDev);
+ if (ErootID != -2 && RootID != ErootID)
+ continue;
+ const auto EsubID = std::get<2>(RootDev);
+ if (((EsubID != -2) || (SubID == -1)) && (EsubID != SubID))
+ continue;
+ const auto ECCSID = std::get<3>(RootDev);
+ if (((ECCSID != -2) || (CCSID == -1)) && (ECCSID != CCSID))
+ continue;
+ // Check if isDiscard
+ if (!std::get<0>(RootDev))
+ return false;
+ return true;
+ }
+ return false;
+}
+
+/// Read environment variables
+void L0OptionsTy::processEnvironmentVars() {
+ // Compilation options for IGC
+ UserCompilationOptions +=
+ std::string(" ") +
+ StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get();
+
+ // Explicit Device mode if ONEAPI_DEVICE_SELECTOR is set
+ const StringEnvar DeviceSelectorVar("ONEAPI_DEVICE_SELECTOR", "");
+ if (DeviceSelectorVar.isPresent()) {
+ std::string EnvStr(std::move(DeviceSelectorVar.get()));
+ uint32_t numDiscard = 0;
+ std::transform(EnvStr.begin(), EnvStr.end(), EnvStr.begin(),
+ [](unsigned char C) { return std::tolower(C); });
+
+ std::vector<std::string_view> Entries = tokenize(EnvStr, ";", true);
+ for (const auto &Term : Entries) {
+ bool isDiscard = false;
+ std::vector<std::string_view> Pair = tokenize(Term, ":", true);
+ if (Pair.empty()) {
+ FAILURE_MESSAGE(
+ "Incomplete selector! Pair and device must be specified.\n");
+ } else if (Pair.size() == 1) {
+ FAILURE_MESSAGE("Incomplete selector! Try '%s:*'if all devices "
+ "under the Pair was original intention.\n",
+ Pair[0].data());
+ } else if (Pair.size() > 2) {
+ FAILURE_MESSAGE(
+ "Error parsing selector string \"%s\" Too many colons (:)\n",
+ Term.data());
+ }
+ if (!((Pair[0][0] == '*') ||
+ (!strncmp(Pair[0].data(), "level_zero", Pair[0].length())) ||
+ (!strncmp(Pair[0].data(), "!level_zero", Pair[0].length()))))
+ break;
+ isDiscard = Pair[0][0] == '!';
+ if (isDiscard)
+ numDiscard++;
+ else if (numDiscard > 0)
+ FAILURE_MESSAGE("All negative(discarding) filters must appear after "
+ "all positive(accepting) filters!");
+
+ std::vector<std::string_view> Targets = tokenize(Pair[1], ",", true);
+ for (const auto &TargetStr : Targets) {
+ bool HasDeviceWildCard = false;
+ bool HasSubDeviceWildCard = false;
+ bool DeviceNum = false;
+ std::vector<std::string_view> DeviceSubTuple =
+ tokenize(TargetStr, ".", true);
+ int32_t RootD[3] = {-1, -1, -1};
+ if (DeviceSubTuple.empty()) {
+ FAILURE_MESSAGE(
+ "ONEAPI_DEVICE_SELECTOR parsing error. Device must be "
+ "specified.");
+ }
+
+ std::string_view TopDeviceStr = DeviceSubTuple[0];
+ static const std::array<std::string, 6> DeviceStr = {
+ "host", "cpu", "gpu", "acc", "fpga", "*"};
+ auto It =
+ find_if(DeviceStr.begin(), DeviceStr.end(),
+ [&](const auto &Name) { return TopDeviceStr == Name; });
+ if (It != DeviceStr.end()) {
+ if (TopDeviceStr[0] == '*') {
+ HasDeviceWildCard = true;
+ RootD[0] = -2;
+ } else if (!strncmp(DeviceSubTuple[0].data(), "gpu", 3))
+ continue;
+ } else {
+ std::string TDS(TopDeviceStr);
+ if (!isDigits(TDS)) {
+ FAILURE_MESSAGE("error parsing device number: %s",
+ DeviceSubTuple[0].data());
+ } else {
+ RootD[0] = std::stoi(TDS);
+ DeviceNum = true;
+ }
+ }
+ if (DeviceSubTuple.size() >= 2) {
+ if (!DeviceNum && !HasDeviceWildCard)
+ FAILURE_MESSAGE("sub-devices can only be requested when parent "
+ "device is specified by number or wildcard, not a "
+ "device type like \'gpu\'");
+ std::string_view SubDeviceStr = DeviceSubTuple[1];
+ if (SubDeviceStr[0] == '*') {
+ HasSubDeviceWildCard = true;
+ RootD[1] = -2;
+ } else {
+ if (HasDeviceWildCard) // subdevice is a number and device is a *
+ FAILURE_MESSAGE(
+ "sub-device can't be requested by number if parent "
+ "device is specified by a wildcard.");
+
+ std::string SDS(SubDeviceStr);
+ if (!isDigits(SDS)) {
+ FAILURE_MESSAGE("error parsing subdevice index: %s",
+ DeviceSubTuple[1].data());
+ } else
+ RootD[1] = std::stoi(SDS);
+ }
+ }
+ if (DeviceSubTuple.size() == 3) {
+ std::string_view SubSubDeviceStr = DeviceSubTuple[2];
+ if (SubSubDeviceStr[0] == '*') {
+ RootD[2] = -2;
+ } else {
+ if (HasSubDeviceWildCard)
+ FAILURE_MESSAGE("sub-sub-device can't be requested by number if "
+ "sub-device before is specified by a wildcard.");
+ std::string SSDS(SubSubDeviceStr);
+ if (!isDigits(SSDS)) {
+ FAILURE_MESSAGE("error parsing sub-sub-device index: %s",
+ DeviceSubTuple[2].data());
+ } else
+ RootD[2] = std::stoi(SSDS);
+ }
+ } else if (DeviceSubTuple.size() > 3) {
+ FAILURE_MESSAGE("error parsing %s Only two levels of sub-devices "
+ "supported at this time ",
+ TargetStr.data());
+ }
+ if (isDiscard)
+ ExplicitRootDevices.insert(
+ ExplicitRootDevices.begin(),
+ std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+ RootD[1], RootD[2]));
+ else
+ ExplicitRootDevices.push_back(
+ std::tuple<bool, int32_t, int32_t, int32_t>(!isDiscard, RootD[0],
+ RootD[1], RootD[2]));
+ }
+ }
+ }
+
+ DP("ONEAPI_DEVICE_SELECTOR specified %zu root devices\n",
+ ExplicitRootDevices.size());
+ DP(" (Accept/Discard [T/F] DeviceID[.SubID[.CCSID]]) -2(all), "
+ "-1(ignore)\n");
+ for (auto &T : ExplicitRootDevices) {
+ DP(" %c %d.%d.%d\n", (std::get<0>(T) == true) ? 'T' : 'F', std::get<1>(T),
+ std::get<2>(T), std::get<3>(T));
+ (void)T; // silence warning
+ }
+
+ // Memory pool
+ // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>
+ // <Option> := 0 | <PoolInfoList>
+ // <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]
+ // <PoolInfo> := <MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]
+ // <MemType> := all | device | host | shared
+ // <AllocMax> := non-negative integer or empty, max allocation size in
+ // MB (default: 1)
+ // <Capacity> := positive integer or empty, number of allocations from
+ // a single block (default: 4)
+ // <PoolSize> := positive integer or empty, max pool size in MB
+ // (default: 256)
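+ // Example (illustrative):
+ //   LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=device,8,4,512
+ // enables a device-memory pool with AllocMax = 8 MB, Capacity = 4
+ // allocations per block, and a 512 MB pool size limit.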
+ const StringEnvar MemoryPoolVar("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL", "");
+ if (MemoryPoolVar.isPresent()) {
+ if (MemoryPoolVar.get() == "0") {
+ Flags.UseMemoryPool = 0;
+ MemPoolInfo.clear();
+ } else {
+ std::istringstream Str(MemoryPoolVar.get());
+ int32_t MemType = -1;
+ int32_t Offset = 0;
+ int32_t Valid = 1;
+ const std::array<int32_t, 3> DefaultValue{1, 4, 256};
+ const int32_t AllMemType = INT32_MAX;
+ std::array<int32_t, 3> AllInfo{1, 4, 256};
+ std::map<int32_t, std::array<int32_t, 3>> PoolInfo;
+ for (std::string Token; std::getline(Str, Token, ',') && Valid > 0;) {
+ if (Token == "device") {
+ MemType = TARGET_ALLOC_DEVICE;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "host") {
+ MemType = TARGET_ALLOC_HOST;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "shared") {
+ MemType = TARGET_ALLOC_SHARED;
+ PoolInfo.emplace(MemType, DefaultValue);
+ Offset = 0;
+ } else if (Token == "all") {
+ MemType = AllMemType;
+ Offset = 0;
+ Valid = 2;
+ } else if (Offset < 3 && MemType >= 0) {
+ int32_t Num = std::atoi(Token.c_str());
+ bool ValidNum = (Num >= 0 && Offset == 0) || (Num > 0 && Offset > 0);
+ if (ValidNum && MemType == AllMemType)
+ AllInfo[Offset++] = Num;
+ else if (ValidNum)
+ PoolInfo[MemType][Offset++] = Num;
+ else if (Token.size() == 0)
+ Offset++;
+ else
+ Valid = 0;
+ } else {
+ Valid = 0;
+ }
+ }
+ if (Valid > 0) {
+ if (Valid == 2) {
+ // "all" is specified -- ignore other inputs
+ if (AllInfo[0] > 0) {
+ MemPoolInfo[TARGET_ALLOC_DEVICE] = AllInfo;
+ MemPoolInfo[TARGET_ALLOC_HOST] = AllInfo;
+ MemPoolInfo[TARGET_ALLOC_SHARED] = std::move(AllInfo);
+ } else {
+ MemPoolInfo.clear();
+ }
+ } else {
+ // Use user-specified configuration
+ for (auto &I : PoolInfo) {
+ if (I.second[0] > 0)
+ MemPoolInfo[I.first] = I.second;
+ else
+ MemPoolInfo.erase(I.first);
+ }
+ }
+ } else {
+ DP("Ignoring incorrect memory pool configuration "
+ "LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=%s\n",
+ MemoryPoolVar.get().c_str());
+ DP("LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=<Option>\n");
+ DP(" <Option> := 0 | <PoolInfoList>\n");
+ DP(" <PoolInfoList> := <PoolInfo>[,<PoolInfoList>]\n");
+ DP(" <PoolInfo> := "
+ "<MemType>[,<AllocMax>[,<Capacity>[,<PoolSize>]]]\n");
+ DP(" <MemType> := all | device | host | shared\n");
+ DP(" <AllocMax> := non-negative integer or empty, "
+ "max allocation size in MB (default: 1)\n");
+ DP(" <Capacity> := positive integer or empty, "
+ "number of allocations from a single block (default: 4)\n");
+ DP(" <PoolSize> := positive integer or empty, "
+ "max pool size in MB (default: 256)\n");
+ }
+ }
+ }
+
+ if (StringEnvar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS").isPresent()) {
+ // To match SYCL RT behavior, we just need to check whether
+ // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value
+ // does not matter.
+ CommonSpecConstants.addConstant<char>(0xFF747469, 1);
+ }
+
+ // LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE=<SizeInKB>
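+ // Example (illustrative): LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE=1024
+ // requests a 1 MB staging buffer; values above 16384 KB are capped.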
+ const Envar<size_t> StagingBufferSizeVar(
+ "LIBOMPTARGET_LEVEL_ZERO_STAGING_BUFFER_SIZE");
+ if (StagingBufferSizeVar.isPresent()) {
+ size_t SizeInKB = StagingBufferSizeVar;
+ if (SizeInKB > (16 << 10)) {
+ SizeInKB = (16 << 10);
+ DP("Staging buffer size is capped at %zu KB\n", SizeInKB);
+ }
+ StagingBufferSize = SizeInKB << 10;
+ }
+
+ // LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE=<Fmt>
+ // <Fmt> := sync | async | async_ordered
+ // sync: perform synchronization after each command
+ // async: perform synchronization when it is required
+ // async_ordered: same as "async", but commands are kept in order
+ // This option is ignored unless IMM is fully enabled on compute and copy.
+ // On Intel PVC GPUs, when used with immediate command lists over the Level
+ // Zero backend, a target region may involve multiple command submissions to
+ // the L0 copy and compute queues. L0 events are used for each submission
+ // (data transfer of a single item or kernel execution). When "async" is
+ // specified: a) each data transfer to the device is submitted with an event;
+ // b) the kernel is submitted next with a dependence on all the previous
+ // data-transfer events, and the kernel also has an event associated with it;
+ // c) the data transfer from the device is submitted with a dependence on
+ // the kernel event; d) finally, the host waits for all the events
+ // associated with the data transfers from the device.
+ // The env-var affects any "target update" constructs as well.
+ // The env-var only affects the L0 copy/compute commands issued from a
+ // single target construct execution, not across multiple invocations.
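+ // Example (illustrative): LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE=async_ordered
+ // keeps submissions asynchronous while preserving the submission order.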
+ const StringEnvar CommandModeVar("LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE");
+ if (CommandModeVar.isPresent()) {
+ if (match(CommandModeVar, "sync"))
+ CommandMode = CommandModeTy::Sync;
+ else if (match(CommandModeVar, "async"))
+ CommandMode = CommandModeTy::Async;
+ else if (match(CommandModeVar, "async_ordered"))
+ CommandMode = CommandModeTy::AsyncOrdered;
+ else
+ INVALID_OPTION(LIBOMPTARGET_LEVEL_ZERO_COMMAND_MODE,
+ CommandModeVar.get().c_str());
+ }
+}
+
+/// Parse the given string and split it into string_view tokens based on the
+/// Delim character.
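+/// For example (illustrative), tokenize("gpu:0;gpu:1", ";") yields
+/// {"gpu:0", "gpu:1"}.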
+std::vector<std::string_view>
+L0OptionsTy::tokenize(const std::string_view &Filter, const std::string &Delim,
+ bool ProhibitEmptyTokens) {
+ std::vector<std::string_view> Tokens;
+ size_t Pos = 0;
+ size_t LastPos = 0;
+ while ((Pos = Filter.find(Delim, LastPos)) != std::string::npos) {
+ std::string_view Tok(Filter.data() + LastPos, (Pos - LastPos));
+
+ if (!Tok.empty()) {
+ Tokens.push_back(Tok);
+ } else if (ProhibitEmptyTokens) {
+ FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input "
+ "before '%s'delimiter is not allowed.",
+ Delim.c_str());
+ }
+ // move the search starting index
+ LastPos = Pos + 1;
+ }
+
+ // Add remainder if any
+ if (LastPos < Filter.size()) {
+ std::string_view Tok(Filter.data() + LastPos, Filter.size() - LastPos);
+ Tokens.push_back(Tok);
+ } else if ((LastPos != 0) && ProhibitEmptyTokens) {
+ // If the delimiter is the last symbol in the string.
+ FAILURE_MESSAGE("ONEAPI_DEVICE_SELECTOR parsing error. Empty input after "
+ "'%s' delimiter is not allowed.",
+ Delim.c_str());
+ }
+ return Tokens;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
new file mode 100644
index 0000000000000..51d6595560484
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Plugin.cpp
@@ -0,0 +1,285 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <level_zero/zes_api.h>
+
+#include "L0Device.h"
+#include "L0Interop.h"
+#include "L0Kernel.h"
+#include "L0Plugin.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+using namespace llvm::omp::target;
+using namespace error;
+
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+// Common data across all possible plugin instantiations
+L0OptionsTy LevelZeroPluginTy::Options;
+
+int32_t LevelZeroPluginTy::findDevices() {
+ CALL_ZE_RET_ZERO(zeInit, ZE_INIT_FLAG_GPU_ONLY);
+ uint32_t NumDrivers = 0;
+ CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, nullptr);
+ if (NumDrivers == 0) {
+ DP("Cannot find any drivers.\n");
+ return 0;
+ }
+ const bool ExplicitMode = getOptions().ExplicitRootDevices.size() > 0;
+
+ // We expect multiple drivers on Windows to support different device types,
+ // so we need to maintain multiple drivers and contexts in general.
+ llvm::SmallVector<ze_driver_handle_t> FoundDrivers(NumDrivers);
+ CALL_ZE_RET_ZERO(zeDriverGet, &NumDrivers, FoundDrivers.data());
+
+ struct RootInfoTy {
+ uint32_t OrderId;
+ ze_device_handle_t zeDevice;
+ L0ContextTy *Driver;
+ bool IsDiscrete;
+ };
+ llvm::SmallVector<RootInfoTy> RootDevices;
+
+ uint32_t OrderId = 0;
+ for (uint32_t DriverId = 0; DriverId < NumDrivers; DriverId++) {
+ const auto &Driver = FoundDrivers[DriverId];
+ uint32_t DeviceCount = 0;
+ ze_result_t RC;
+ CALL_ZE(RC, zeDeviceGet, Driver, &DeviceCount, nullptr);
+ if (RC != ZE_RESULT_SUCCESS || DeviceCount == 0) {
+ DP("Cannot find any devices from driver " DPxMOD ".\n", DPxPTR(Driver));
+ continue;
+ }
+ // We have a driver that supports at least one device
+ ContextList.emplace_back(*this, Driver, DriverId);
+ auto &DrvInfo = ContextList.back();
+ llvm::SmallVector<ze_device_handle_t> FoundDevices(DeviceCount);
+ CALL_ZE_RET_ZERO(zeDeviceGet, Driver, &DeviceCount, FoundDevices.data());
+
+ for (auto &zeDevice : FoundDevices)
+ RootDevices.push_back(
+ {OrderId++, zeDevice, &DrvInfo, L0DeviceTy::isDiscrete(zeDevice)});
+ }
+
+ // move discrete devices to the front
+ std::sort(RootDevices.begin(), RootDevices.end(),
+ [](const RootInfoTy &A, const RootInfoTy &B) {
+ // If both devices are discrete, or both are not, order by OrderId;
+ // otherwise, the discrete one goes first.
+
+ if (A.IsDiscrete && B.IsDiscrete)
+ return A.OrderId < B.OrderId;
+ if (!A.IsDiscrete && !B.IsDiscrete)
+ return A.OrderId < B.OrderId;
+ return A.IsDiscrete;
+ });
+
+ struct DeviceInfoTy {
+ L0DeviceIdTy Id;
+ L0ContextTy *Driver;
+ bool isRoot() const { return Id.SubId < 0 && Id.CCSId < 0; }
+ };
+
+ llvm::SmallVector<DeviceInfoTy> DevicesToAdd;
+
+ // helper lambdas
+ auto addDevice = [ExplicitMode,
+ &DevicesToAdd](auto &zeDevice, auto *Driver, int32_t RootId,
+ int32_t SubId = -1, int32_t CCSId = -1) {
+ if (!ExplicitMode || getOptions().shouldAddDevice(RootId, SubId, CCSId)) {
+ DevicesToAdd.push_back({{zeDevice, RootId, SubId, CCSId}, Driver});
+ }
+ };
+ for (size_t RootId = 0; RootId < RootDevices.size(); RootId++) {
+ const auto zeDevice = RootDevices[RootId].zeDevice;
+ auto *RootDriver = RootDevices[RootId].Driver;
+ addDevice(zeDevice, RootDriver, RootId);
+ }
+ NumDevices = DevicesToAdd.size();
+ auto DeviceId = 0;
+ for (auto &DeviceInfo : DevicesToAdd) {
+ auto RootId = DeviceInfo.Id.RootId;
+ auto SubId = DeviceInfo.Id.SubId;
+ auto CCSId = DeviceInfo.Id.CCSId;
+ auto zeDevice = DeviceInfo.Id.zeId;
+ auto *Driver = DeviceInfo.Driver;
+
+ std::string IdStr = std::to_string(RootId) +
+ (SubId < 0 ? "" : "." + std::to_string(SubId)) +
+ (CCSId < 0 ? "" : "." + std::to_string(CCSId));
+
+ L0Devices.push_back(new L0DeviceTy(*this, DeviceId, getNumRootDevices(),
+ zeDevice, *Driver, std::move(IdStr),
+ CCSId < 0 ? 0 : CCSId /* ComputeIndex */
+ ));
+ DeviceId++;
+ }
+
+ DP("Found %" PRIu32 " root devices, %" PRIu32 " total devices.\n",
+ getNumRootDevices(), NumDevices);
+ DP("List of devices (DeviceID[.SubID[.CCSID]])\n");
+ for (auto &l0Device : L0Devices) {
+ DP("-- %s\n", l0Device->getZeIdCStr());
+ (void)l0Device; // silence warning
+ }
+
+ if (getDebugLevel() > 0) {
+ DP("Root Device Information\n");
+ for (uint32_t I = 0; I < getNumRootDevices(); I++) {
+ auto &l0Device = getDeviceFromId(I);
+ l0Device.reportDeviceInfo();
+ }
+ }
+
+ return getNumRootDevices();
+}
+
+/// Clean-up routine to be invoked by the destructor or
+/// LevelZeroPluginTy::deinit.
+void LevelZeroPluginTy::closeRTL() {
+
+ ContextTLSTable.clear();
+ DeviceTLSTable.clear();
+ ThreadTLSTable.clear();
+ ContextList.clear();
+
+ DP("Plugin closed successfully\n");
+}
+
+Expected<int32_t> LevelZeroPluginTy::initImpl() {
+ DP("Level0 NG plugin initialization\n");
+ // process options before anything else
+ Options.init();
+ return findDevices();
+}
+
+Error LevelZeroPluginTy::deinitImpl() {
+ DP("Deinit Level0 plugin!\n");
+ closeRTL();
+ return Plugin::success();
+}
+
+GenericDeviceTy *LevelZeroPluginTy::createDevice(GenericPluginTy &Plugin,
+ int32_t DeviceId,
+ int32_t NumDevices) {
+ return &getDeviceFromId(DeviceId);
+}
+
+GenericGlobalHandlerTy *LevelZeroPluginTy::createGlobalHandler() {
+ return new L0GlobalHandlerTy();
+}
+
+uint16_t LevelZeroPluginTy::getMagicElfBits() const {
+ // TODO: We need to register a real ELF machine type
+ return 0x8086;
+}
+
+Triple::ArchType LevelZeroPluginTy::getTripleArch() const {
+ return Triple::spirv64;
+}
+
+const char *LevelZeroPluginTy::getName() const { return GETNAME(TARGET_NAME); }
+
+Error LevelZeroPluginTy::flushQueueImpl(omp_interop_val_t *Interop) {
+ return Plugin::success();
+}
+
+Expected<bool> LevelZeroPluginTy::isELFCompatible(uint32_t DeviceId,
+ StringRef Image) const {
+ uint64_t MajorVer, MinorVer;
+ return isValidOneOmpImage(Image, MajorVer, MinorVer);
+}
+
+Error LevelZeroPluginTy::syncBarrierImpl(omp_interop_val_t *Interop) {
+ if (!Interop) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ if (!Interop->async_info || !Interop->async_info->Queue)
+ return Plugin::success();
+
+ // L0 object
+ const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ const auto device_id = Interop->device_id;
+ auto &l0Device = getDeviceFromId(device_id);
+
+ // We can synchronize both L0 & SYCL objects with the same ze command
+ if (l0Device.useImmForInterop()) {
+ DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+ " with ImmCmdList barrier\n",
+ DPxPTR(Interop));
+ auto ImmCmdList = L0->ImmCmdList;
+ auto Event = l0Device.getEvent();
+
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, Event, 0,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, UINT64_MAX);
+ l0Device.releaseEvent(Event);
+ } else {
+ DP("LevelZeroPluginTy::sync_barrier: Synchronizing " DPxMOD
+ " with queue synchronize\n",
+ DPxPTR(Interop));
+ auto CmdQueue = L0->CommandQueue;
+ CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX);
+ }
+
+ return Plugin::success();
+}
+
+Error LevelZeroPluginTy::asyncBarrierImpl(omp_interop_val_t *Interop) {
+ if (!Interop) {
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Invalid/inconsistent OpenMP interop " DPxMOD "\n",
+ DPxPTR(Interop));
+ }
+ if (!Interop->async_info || !Interop->async_info->Queue)
+ return Plugin::success();
+
+ const auto L0 = static_cast<L0Interop::Property *>(Interop->rtl_property);
+ const auto device_id = Interop->device_id;
+ if (Interop->attrs.inorder)
+ return Plugin::success();
+
+ auto &l0Device = getDeviceFromId(device_id);
+ if (l0Device.useImmForInterop()) {
+ DP("LevelZeroPluginTy::async_barrier: Appending ImmCmdList barrier "
+ "to " DPxMOD "\n",
+ DPxPTR(Interop));
+ auto ImmCmdList = L0->ImmCmdList;
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, ImmCmdList, nullptr, 0,
+ nullptr);
+ } else {
+ DP("LevelZeroPluginTy::async_barrier: Appending CmdList barrier to " DPxMOD
+ "\n",
+ DPxPTR(Interop));
+ auto CmdQueue = L0->CommandQueue;
+ ze_command_list_handle_t CmdList = l0Device.getCmdList();
+ CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
+ CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
+ nullptr);
+ CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
+ }
+
+ return Plugin::success();
+}
+
+} // namespace llvm::omp::target::plugin
+
+extern "C" {
+llvm::omp::target::plugin::GenericPluginTy *createPlugin_level_zero() {
+ return new llvm::omp::target::plugin::LevelZeroPluginTy();
+}
+}
diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
new file mode 100644
index 0000000000000..33c19b0e7c50d
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp
@@ -0,0 +1,625 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Program abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <dlfcn.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // !_WIN32
+
+#include "L0Plugin.h"
+#include "L0Program.h"
+
+namespace llvm::omp::target::plugin {
+
+Error L0GlobalHandlerTy::getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+ DeviceImageTy &Image,
+ GlobalTy &DeviceGlobal) {
+ const char *GlobalName = DeviceGlobal.getName().data();
+
+ L0DeviceTy &l0Device = static_cast<L0DeviceTy &>(Device);
+ const L0ProgramTy *Program =
+ l0Device.getProgramFromImage(Image.getTgtImage());
+ void *Addr = Program->getOffloadVarDeviceAddr(GlobalName);
+
+ // Save the pointer to the symbol allowing nullptr.
+ DeviceGlobal.setPtr(Addr);
+
+ if (Addr == nullptr)
+ return Plugin::error(ErrorCode::UNKNOWN, "Failed to load global '%s'",
+ GlobalName);
+
+ return Plugin::success();
+}
+
+inline L0DeviceTy &L0ProgramTy::getL0Device() const {
+ return L0DeviceTy::makeL0Device(getDevice());
+}
+
+L0ProgramTy::~L0ProgramTy() {
+ for (auto *Kernel : Kernels) {
+ // We need explicit destructor and deallocate calls to release the kernels
+ // created by `GenericDeviceTy::constructKernel()`.
+ Kernel->~L0KernelTy();
+ getL0Device().getPlugin().free(Kernel);
+ }
+ for (auto Module : Modules) {
+ CALL_ZE_RET_VOID(zeModuleDestroy, Module);
+ }
+}
+
+void L0ProgramTy::setLibModule() {
+#if _WIN32
+ return;
+#else
+ const auto *Image = getTgtImage();
+ const size_t NumEntries =
+ static_cast<size_t>(Image->EntriesEnd - Image->EntriesBegin);
+ for (size_t I = 0; I < NumEntries; I++) {
+ const auto &Entry = Image->EntriesBegin[I];
+ // Image contains a kernel, so it is not compiled as a library module
+ if (Entry.SymbolName && Entry.Size == 0)
+ return;
+ }
+ // Check if the image belongs to a dynamic library
+ Dl_info DLI{nullptr};
+ if (dladdr(Image->ImageStart, &DLI) && DLI.dli_fname) {
+ std::vector<uint8_t> FileBin;
+ auto Size = readFile(DLI.dli_fname, FileBin);
+ if (Size) {
+ auto MB = MemoryBuffer::getMemBuffer(
+ StringRef(reinterpret_cast<const char *>(FileBin.data()), Size),
+ /*BufferName=*/"", /*RequiresNullTerminator=*/false);
+ auto ELF = ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ if (ELF) {
+ if (auto *Obj = dyn_cast<ELF64LEObjectFile>((*ELF).get())) {
+ const auto Header = Obj->getELFFile().getHeader();
+ if (Header.e_type == ELF::ET_DYN) {
+ DP("Processing current image as library\n");
+ IsLibModule = true;
+ }
+ }
+ }
+ }
+ }
+#endif // _WIN32
+}
+
+int32_t L0ProgramTy::addModule(size_t Size, const uint8_t *Image,
+ const std::string &CommonBuildOptions,
+ ze_module_format_t Format) {
+ const ze_module_constants_t SpecConstants =
+ LevelZeroPluginTy::getOptions().CommonSpecConstants.getModuleConstants();
+ auto &l0Device = getL0Device();
+ std::string BuildOptions(CommonBuildOptions);
+
+ // Add required flag to enable dynamic linking.
+ if (IsLibModule)
+ BuildOptions += " -library-compilation ";
+
+ ze_module_desc_t ModuleDesc{};
+ ModuleDesc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
+ ModuleDesc.pNext = nullptr;
+ ModuleDesc.format = Format;
+ ze_module_handle_t Module = nullptr;
+ ze_module_build_log_handle_t BuildLog = nullptr;
+ ze_result_t RC;
+
+ // Build a single module from a single image
+ ModuleDesc.inputSize = Size;
+ ModuleDesc.pInputModule = Image;
+ ModuleDesc.pBuildFlags = BuildOptions.c_str();
+ ModuleDesc.pConstants = &SpecConstants;
+ CALL_ZE_RC(RC, zeModuleCreate, l0Device.getZeContext(),
+ l0Device.getZeDevice(), &ModuleDesc, &Module, &BuildLog);
+
+ const bool BuildFailed = (RC != ZE_RESULT_SUCCESS);
+
+ if (BuildFailed) {
+ if (IsLibModule)
+ return OFFLOAD_SUCCESS;
+ return OFFLOAD_FAIL;
+ } else {
+ // Check if module link is required. We do not need this check for
+ // library module
+ if (!RequiresModuleLink && !IsLibModule) {
+ ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
+ nullptr, 0};
+ CALL_ZE_RET_FAIL(zeModuleGetProperties, Module, &Properties);
+ RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
+ }
+ // For now, assume the first module contains the libraries and globals.
+ if (Modules.empty())
+ GlobalModule = Module;
+ Modules.push_back(Module);
+ l0Device.addGlobalModule(Module);
+ return OFFLOAD_SUCCESS;
+ }
+}
+
+int32_t L0ProgramTy::linkModules() {
+ auto &l0Device = getL0Device();
+ if (!RequiresModuleLink) {
+ DP("Module link is not required\n");
+ return OFFLOAD_SUCCESS;
+ }
+
+ if (Modules.empty()) {
+ DP("Invalid number of modules when linking modules\n");
+ return OFFLOAD_FAIL;
+ }
+
+ ze_result_t RC;
+ ze_module_build_log_handle_t LinkLog = nullptr;
+ CALL_ZE_RC(RC, zeModuleDynamicLink,
+ static_cast<uint32_t>(l0Device.getNumGlobalModules()),
+ l0Device.getGlobalModulesArray(), &LinkLog);
+ const bool LinkFailed = (RC != ZE_RESULT_SUCCESS);
+ return LinkFailed ? OFFLOAD_FAIL : OFFLOAD_SUCCESS;
+}
+
+size_t L0ProgramTy::readFile(const char *FileName,
+ std::vector<uint8_t> &OutFile) const {
+ std::ifstream IFS(FileName, std::ios::binary);
+ if (!IFS.good())
+ return 0;
+ IFS.seekg(0, IFS.end);
+ auto FileSize = static_cast<size_t>(IFS.tellg());
+ OutFile.resize(FileSize);
+ IFS.seekg(0);
+ if (!IFS.read(reinterpret_cast<char *>(OutFile.data()), FileSize)) {
+ OutFile.clear();
+ return 0;
+ }
+ return FileSize;
+}
+
+/// Read SPV from file name
+int32_t L0ProgramTy::readSPVFile(const char *FileName,
+ std::vector<uint8_t> &OutSPV) const {
+ // Resolve full path using the location of the plugin
+ std::string FullPath;
+#ifdef _WIN32
+ char RTLPath[_MAX_PATH];
+ HMODULE RTLModule = nullptr;
+ if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+ GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+ (LPCSTR)&__tgt_target_data_begin_nowait,
+ &RTLModule)) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ if (!GetModuleFileNameA(RTLModule, RTLPath, sizeof(RTLPath))) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ FullPath = RTLPath;
+#else // _WIN32
+ Dl_info RTLInfo;
+ if (!dladdr((void *)&__tgt_target_data_begin_nowait, &RTLInfo)) {
+ DP("Error: module creation failed -- cannot resolve full path\n");
+ return OFFLOAD_FAIL;
+ }
+ FullPath = RTLInfo.dli_fname;
+#endif // _WIN32
+ const size_t PathSep = FullPath.find_last_of("/\\");
+ FullPath.replace(PathSep + 1, std::string::npos, FileName);
+ // Read from the full path
+ if (!readFile(FullPath.c_str(), OutSPV)) {
+ DP("Error: module creation failed -- cannot read %s\n", FullPath.c_str());
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+void L0ProgramTy::replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device,
+ std::string &Options) const {
+ // Options that need to be replaced with backend-specific options
+ static const struct {
+ std::string Option;
+ std::string BackendOption;
+ } OptionTranslationTable[] = {
+ {"-ftarget-compile-fast",
+ "-igc_opts 'PartitionUnit=1,SubroutineThreshold=50000'"},
+ {"-foffload-fp32-prec-div", "-ze-fp32-correctly-rounded-divide-sqrt"},
+ {"-foffload-fp32-prec-sqrt", "-ze-fp32-correctly-rounded-divide-sqrt"},
+ };
+
+ for (const auto &OptPair : OptionTranslationTable) {
+ const size_t Pos = Options.find(OptPair.Option);
+ if (Pos != std::string::npos) {
+ Options.replace(Pos, OptPair.Option.length(), OptPair.BackendOption);
+ }
+ }
+}
+
+// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h:
+#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3
+
+bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer,
+ uint64_t &MinorVer) {
+ const auto MB = MemoryBuffer::getMemBuffer(Image,
+ /*BufferName=*/"",
+ /*RequiresNullTerminator=*/false);
+ auto ExpectedNewE =
+ ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ if (!ExpectedNewE) {
+ DP("Warning: unable to get ELF handle!\n");
+ return false;
+ }
+ bool Res = false;
+ auto processObjF = [&](const auto ELFObjF) {
+ if (!ELFObjF) {
+ DP("Warning: Unexpected ELF type!\n");
+ return false;
+ }
+ const auto &ELFF = ELFObjF->getELFFile();
+ auto Sections = ELFF.sections();
+ if (!Sections) {
+ DP("Warning: unable to get ELF sections!\n");
+ return false;
+ }
+ bool SeenOffloadSection = false;
+ for (auto Sec : *Sections) {
+ if (Sec.sh_type != ELF::SHT_NOTE)
+ continue;
+ Error Err = Plugin::success();
+ for (auto Note : ELFF.notes(Sec, Err)) {
+ if (Err) {
+ DP("Warning: unable to get ELF notes handle!\n");
+ return false;
+ }
+ if (Note.getName() != "INTELONEOMPOFFLOAD")
+ continue;
+ SeenOffloadSection = true;
+ if (Note.getType() != NT_INTEL_ONEOMP_OFFLOAD_VERSION)
+ continue;
+
+ std::string DescStr(std::move(Note.getDescAsStringRef(4).str()));
+ const auto DelimPos = DescStr.find('.');
+ if (DelimPos == std::string::npos) {
+ // The version has to look like "Major#.Minor#".
+ DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n",
+ DescStr.c_str());
+ return false;
+ }
+ const std::string MajorVerStr = DescStr.substr(0, DelimPos);
+ DescStr.erase(0, DelimPos + 1);
+ MajorVer = std::stoull(MajorVerStr);
+ MinorVer = std::stoull(DescStr);
+ return (MajorVer == 1 && MinorVer == 0);
+ }
+ }
+ return SeenOffloadSection;
+ };
+ if (const auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+ Res = processObjF(O);
+ } else if (const auto *O =
+ dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+ Res = processObjF(O);
+ } else {
+ assert(false && "Unexpected ELF format");
+ }
+ return Res;
+}
+
+static StringRef getImageStringRef(const __tgt_device_image *Image) {
+ const char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
+ const char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
+ const size_t ImgSize = ImgEnd - ImgBegin;
+ return StringRef(ImgBegin, ImgSize);
+}
+
+bool isValidOneOmpImage(const __tgt_device_image *Image, uint64_t &MajorVer,
+ uint64_t &MinorVer) {
+ return isValidOneOmpImage(getImageStringRef(Image), MajorVer, MinorVer);
+}
+
+int32_t L0ProgramTy::buildModules(std::string &BuildOptions) {
+ auto &l0Device = getL0Device();
+ auto *Image = getTgtImage();
+ if (identify_magic(getImageStringRef(Image)) == file_magic::spirv_object) {
+ // Handle legacy plain SPIR-V image.
+ uint8_t *ImgBegin = reinterpret_cast<uint8_t *>(Image->ImageStart);
+ uint8_t *ImgEnd = reinterpret_cast<uint8_t *>(Image->ImageEnd);
+ size_t ImgSize = ImgEnd - ImgBegin;
+ return addModule(ImgSize, ImgBegin, BuildOptions,
+ ZE_MODULE_FORMAT_IL_SPIRV);
+ }
+
+ uint64_t MajorVer, MinorVer;
+ if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+ DP("Warning: image is not a valid oneAPI OpenMP image.\n");
+ return OFFLOAD_FAIL;
+ }
+
+ setLibModule();
+
+ // Iterate over the images and pick the first one that fits.
+ uint64_t ImageCount = 0;
+ struct V1ImageInfo {
+ // 0 - native, 1 - SPIR-V
+ uint64_t Format = (std::numeric_limits<uint64_t>::max)();
+ std::string CompileOpts;
+ std::string LinkOpts;
+ // We may have multiple sections created from split-kernel mode
+ std::vector<const uint8_t *> PartBegin;
+ std::vector<uint64_t> PartSize;
+
+ V1ImageInfo(uint64_t Format, std::string CompileOpts, std::string LinkOpts)
+ : Format(Format), CompileOpts(std::move(CompileOpts)),
+ LinkOpts(std::move(LinkOpts)) {}
+ };
+ std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
+
+ auto MB = MemoryBuffer::getMemBuffer(getImageStringRef(Image),
+ /*BufferName=*/"",
+ /*RequiresNullTerminator=*/false);
+ auto ExpectedNewE =
+ ELFObjectFileBase::createELFObjectFile(MB->getMemBufferRef());
+ assert(ExpectedNewE &&
+ "isValidOneOmpImage() returns true for invalid ELF image");
+ auto processELF = [&](auto *EObj) {
+ assert(EObj && "isValidOneOmpImage() returns true for invalid ELF image.");
+ assert(MajorVer == 1 && MinorVer == 0 &&
+ "FIXME: update image processing for new oneAPI OpenMP version.");
+ const auto &E = EObj->getELFFile();
+ // Collect auxiliary information.
+ uint64_t MaxImageIdx = 0;
+
+ auto Sections = E.sections();
+ assert(Sections && "isValidOneOmpImage() returns true for ELF image with "
+ "invalid sections.");
+
+ for (auto Sec : *Sections) {
+ if (Sec.sh_type != ELF::SHT_NOTE)
+ continue;
+ Error Err = Plugin::success();
+ for (auto Note : E.notes(Sec, Err)) {
+ assert(!Err && "isValidOneOmpImage() returns true for ELF image with "
+ "invalid notes.");
+ if (Note.getName().str() != "INTELONEOMPOFFLOAD")
+ continue;
+
+ const uint64_t Type = Note.getType();
+ std::string DescStr(std::move(Note.getDescAsStringRef(4)));
+ switch (Type) {
+ default:
+ DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
+ ImageCount = std::stoull(DescStr);
+ break;
+ case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
+ std::vector<std::string> Parts;
+ do {
+ const auto DelimPos = DescStr.find('\0');
+ if (DelimPos == std::string::npos) {
+ Parts.push_back(std::move(DescStr));
+ break;
+ }
+ Parts.push_back(DescStr.substr(0, DelimPos));
+ DescStr.erase(0, DelimPos + 1);
+ } while (Parts.size() < 4);
+
+ // Ignore records with less than 4 strings.
+ if (Parts.size() != 4) {
+ DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX "
+ "record is ignored.\n");
+ continue;
+ }
+
+ const uint64_t Idx = std::stoull(Parts[0]);
+ MaxImageIdx = (std::max)(MaxImageIdx, Idx);
+ if (AuxInfo.find(Idx) != AuxInfo.end()) {
+ DP("Warning: duplicate auxiliary information for image %" PRIu64
+ " is ignored.\n",
+ Idx);
+ continue;
+ }
+ AuxInfo.emplace(
+ std::piecewise_construct, std::forward_as_tuple(Idx),
+ std::forward_as_tuple(std::stoull(Parts[1]), Parts[2], Parts[3]));
+ // Image pointer and size
+ // will be initialized later.
+ }
+ }
+ }
+ }
+
+ if (MaxImageIdx >= ImageCount)
+ DP("Warning: invalid image index found in auxiliary information.\n");
+
+ for (auto Sec : *Sections) {
+ const char *Prefix = "__openmp_offload_spirv_";
+ auto ExpectedSectionName = E.getSectionName(Sec);
+ assert(ExpectedSectionName && "isValidOneOmpImage() returns true for ELF "
+ "image with invalid section names");
+ std::string SectionName = (*ExpectedSectionName).str();
+ if (SectionName.find(Prefix) != 0)
+ continue;
+ SectionName.erase(0, std::strlen(Prefix));
+
+ // Expected section name in split-kernel mode:
+ // __openmp_offload_spirv_<image_id>_<part_id>
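+ // e.g. (illustrative) "__openmp_offload_spirv_3_1" is part 1 of image 3.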
+ auto PartIdLoc = SectionName.find("_");
+ if (PartIdLoc != std::string::npos) {
+ DP("Found a split section in the image\n");
+ // It seems that we do not need the part ID as long as the parts are
+ // ordered in the image and we keep that ordering in the runtime.
+ SectionName.erase(PartIdLoc);
+ } else {
+ DP("Found a single section in the image\n");
+ }
+
+ uint64_t Idx = std::stoull(SectionName);
+ if (Idx >= ImageCount) {
+ DP("Warning: ignoring image section (index %" PRIu64
+ " is out of range).\n",
+ Idx);
+ continue;
+ }
+
+ auto AuxInfoIt = AuxInfo.find(Idx);
+ if (AuxInfoIt == AuxInfo.end()) {
+ DP("Warning: ignoring image section (no aux info).\n");
+ continue;
+ }
+ auto Contents = E.getSectionContents(Sec);
+ assert(Contents);
+ AuxInfoIt->second.PartBegin.push_back((*Contents).data());
+ AuxInfoIt->second.PartSize.push_back(Sec.sh_size);
+ }
+ };
+
+ if (auto *O = dyn_cast<ELF64LEObjectFile>((*ExpectedNewE).get())) {
+ processELF(O);
+ } else if (auto *O = dyn_cast<ELF32LEObjectFile>((*ExpectedNewE).get())) {
+ processELF(O);
+ } else {
+ assert(false && "Unexpected ELF format");
+ }
+
+ for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) {
+ const auto It = AuxInfo.find(Idx);
+ if (It == AuxInfo.end()) {
+ DP("Warning: image %" PRIu64
+ " without auxiliary information is ingored.\n",
+ Idx);
+ continue;
+ }
+
+ const auto NumParts = It->second.PartBegin.size();
+ // Split-kernel is not supported in SPIR-V format
+ if (NumParts > 1 && It->second.Format != 0) {
+ DP("Warning: split-kernel images are not supported in SPIR-V format\n");
+ continue;
+ }
+
+ // Skip unknown image format
+ if (It->second.Format != 0 && It->second.Format != 1) {
+ DP("Warning: image %" PRIu64 "is ignored due to unknown format.\n", Idx);
+ continue;
+ }
+
+ const bool IsBinary = (It->second.Format == 0);
+ const auto ModuleFormat =
+ IsBinary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV;
+ std::string Options = BuildOptions;
+ {
+ Options += " " + It->second.CompileOpts + " " + It->second.LinkOpts;
+ replaceDriverOptsWithBackendOpts(l0Device, Options);
+ }
+
+ for (size_t I = 0; I < NumParts; I++) {
+ const unsigned char *ImgBegin =
+ reinterpret_cast<const unsigned char *>(It->second.PartBegin[I]);
+ size_t ImgSize = It->second.PartSize[I];
+
+ auto RC = addModule(ImgSize, ImgBegin, Options, ModuleFormat);
+
+ if (RC != OFFLOAD_SUCCESS) {
+ DP("Error: failed to create program from %s "
+ "(%" PRIu64 "-%zu).\n",
+ IsBinary ? "Binary" : "SPIR-V", Idx, I);
+ return OFFLOAD_FAIL;
+ }
+ }
+
+ DP("Created module from image #%" PRIu64 ".\n", Idx);
+ BuildOptions = std::move(Options);
+
+ return OFFLOAD_SUCCESS;
+ }
+
+ return OFFLOAD_FAIL;
+}
+
+void *L0ProgramTy::getOffloadVarDeviceAddr(const char *CName) const {
+ DP("Looking up OpenMP global variable '%s'.\n", CName);
+
+ if (!GlobalModule || !CName)
+ return nullptr;
+
+ std::string Name(CName);
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ for (auto Module : Modules) {
+ CALL_ZE(RC, zeModuleGetGlobalPointer, Module, Name.c_str(), &SizeDummy,
+ &DevicePtr);
+ if (RC == ZE_RESULT_SUCCESS && DevicePtr)
+ return DevicePtr;
+ }
+ DP("Warning: global variable '%s' was not found in the device.\n",
+ Name.c_str());
+ return nullptr;
+}
+
+int32_t L0ProgramTy::readGlobalVariable(const char *Name, size_t Size,
+ void *HostPtr) {
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+ &DevicePtr);
+ if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+ DP("Warning: cannot read from device global variable %s\n", Name);
+ return OFFLOAD_FAIL;
+ }
+ return getL0Device().enqueueMemCopy(HostPtr, DevicePtr, Size);
+}
+
+int32_t L0ProgramTy::writeGlobalVariable(const char *Name, size_t Size,
+ const void *HostPtr) {
+ size_t SizeDummy = 0;
+ void *DevicePtr = nullptr;
+ ze_result_t RC;
+ CALL_ZE(RC, zeModuleGetGlobalPointer, GlobalModule, Name, &SizeDummy,
+ &DevicePtr);
+ if (RC != ZE_RESULT_SUCCESS || !DevicePtr) {
+ DP("Warning: cannot write to device global variable %s\n", Name);
+ return OFFLOAD_FAIL;
+ }
+ return getL0Device().enqueueMemCopy(DevicePtr, HostPtr, Size);
+}
+
+int32_t L0ProgramTy::loadModuleKernels() {
+ // We need to map kernels to modules here before filling the offload entries
+ // since we don't know which module contains a kernel with a given name.
+ std::unordered_map<std::string, ze_kernel_handle_t> ModuleKernels;
+ for (auto Module : Modules) {
+ uint32_t Count = 0;
+ CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, nullptr);
+ if (Count == 0)
+ continue;
+
+ llvm::SmallVector<const char *> Names(Count);
+ CALL_ZE_RET_FAIL(zeModuleGetKernelNames, Module, &Count, Names.data());
+
+ for (auto *Name : Names) {
+ KernelsToModuleMap.emplace(Name, Module);
+ }
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
new file mode 100644
index 0000000000000..3721d686393bd
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/OmpWrapper.cpp
@@ -0,0 +1,71 @@
+//===--- level_zero/src/OmpWrapper.cpp ------------------------*- C++ -*-===//
+//
+// Implement wrappers for OpenMP runtime entry points resolved through dlopen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DLWrap.h"
+#include "Shared/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "L0Defs.h"
+
+DLWRAP_INITIALIZE()
+
+DLWRAP_INTERNAL(omp_get_max_teams, 0)
+DLWRAP_INTERNAL(omp_get_teams_thread_limit, 0)
+
+DLWRAP_FINALIZE()
+
+#ifndef TARGET_NAME
+#error "Missing TARGET_NAME macro"
+#endif
+#ifndef DEBUG_PREFIX
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+#endif
+
+static bool loadOpenMP() {
+ static bool Loaded{false};
+ if (Loaded)
+ return true;
+
+ const char *OpenMPLibrary = "libomp.so";
+ std::string ErrMsg;
+
+ DP("Trying to load %s\n", OpenMPLibrary);
+ auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
+ llvm::sys::DynamicLibrary::getPermanentLibrary(OpenMPLibrary, &ErrMsg));
+ if (!DynlibHandle->isValid()) {
+ if (ErrMsg.empty())
+ ErrMsg = "unknown error";
+ DP("Unable to load library '%s': %s!\n", OpenMPLibrary, ErrMsg.c_str());
+ return false;
+ }
+
+ for (size_t I = 0; I < dlwrap::size(); I++) {
+ const char *Sym = dlwrap::symbol(I);
+
+ void *P = DynlibHandle->getAddressOfSymbol(Sym);
+ if (P == nullptr) {
+ DP("Unable to find '%s' in '%s'!\n", Sym, OpenMPLibrary);
+ return false;
+ }
+ DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
+
+ *dlwrap::pointer(I) = P;
+ }
+
+ Loaded = true; // Remember the successful load so later calls return early.
+ return true;
+}
+
+int omp_get_max_teams() {
+ if (!loadOpenMP())
+ return 0;
+ return dlwrap_omp_get_max_teams();
+}
+
+int omp_get_teams_thread_limit() {
+ if (!loadOpenMP())
+ return 0;
+ return dlwrap_omp_get_teams_thread_limit();
+}