[Mlir-commits] [mlir] [mlir][ExecutionEngine] Add LevelZeroRuntimeWrapper. (PR #151038)

Md Abdullah Shahneous Bari llvmlistbot at llvm.org
Wed Aug 6 09:09:09 PDT 2025


https://github.com/mshahneo updated https://github.com/llvm/llvm-project/pull/151038

>From 8916f3dfd994f00a959117900b58962ad831e4c9 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Mon, 28 Jul 2025 20:29:55 +0000
Subject: [PATCH 01/12] [mlir][ExecutionEngine] Add LevelZeroRuntimeWrapper.

Adds LevelZeroRuntime wrapper and tests.

Co-authored-by: Artem Kroviakov <artem.kroviakov at intel.com>
Co-authored-by: Nishant Patel <nishant.b.patel at intel.com>
---
 ...lZero.cmake => FindLevelZeroRuntime.cmake} | 113 ++--
 mlir/lib/ExecutionEngine/CMakeLists.txt       | 117 +++--
 .../LevelZeroRuntimeWrappers.cpp              | 491 ++++++++++++++++++
 .../GPU/LEVELZERO/gpu-addf32-to-spirv.mlir    |  56 ++
 .../GPU/LEVELZERO/gpu-addi64-to-spirv.mlir    |  54 ++
 .../LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir |  53 ++
 .../GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir   |  79 +++
 .../Integration/GPU/LEVELZERO/lit.local.cfg   |   2 +
 8 files changed, 878 insertions(+), 87 deletions(-)
 rename mlir/cmake/modules/{FindLevelZero.cmake => FindLevelZeroRuntime.cmake} (66%)
 create mode 100644 mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
 create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
 create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
 create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
 create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
 create mode 100644 mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg

diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
similarity index 66%
rename from mlir/cmake/modules/FindLevelZero.cmake
rename to mlir/cmake/modules/FindLevelZeroRuntime.cmake
index 012187f0afc0b..b1e8e5b6387f2 100644
--- a/mlir/cmake/modules/FindLevelZero.cmake
+++ b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
@@ -20,7 +20,6 @@ include(FindPackageHandleStandardArgs)
 # Search path priority
 # 1. CMake Variable LEVEL_ZERO_DIR
 # 2. Environment Variable LEVEL_ZERO_DIR
-
 if(NOT LEVEL_ZERO_DIR)
     if(DEFINED ENV{LEVEL_ZERO_DIR})
         set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
@@ -28,32 +27,32 @@ if(NOT LEVEL_ZERO_DIR)
 endif()
 
 if(LEVEL_ZERO_DIR)
-    find_path(LevelZero_INCLUDE_DIR
+    find_path(LevelZeroRuntime_INCLUDE_DIR
         NAMES level_zero/ze_api.h
         PATHS ${LEVEL_ZERO_DIR}/include
         NO_DEFAULT_PATH
     )
 
     if(LINUX)
-        find_library(LevelZero_LIBRARY
+        find_library(LevelZeroRuntime_LIBRARY
             NAMES ze_loader
             PATHS ${LEVEL_ZERO_DIR}/lib
-                  ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+            ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
             NO_DEFAULT_PATH
         )
     else()
-        find_library(LevelZero_LIBRARY
+        find_library(LevelZeroRuntime_LIBRARY
             NAMES ze_loader
             PATHS ${LEVEL_ZERO_DIR}/lib
             NO_DEFAULT_PATH
         )
     endif()
 else()
-    find_path(LevelZero_INCLUDE_DIR
+    find_path(LevelZeroRuntime_INCLUDE_DIR
         NAMES level_zero/ze_api.h
     )
 
-    find_library(LevelZero_LIBRARY
+    find_library(LevelZeroRuntime_LIBRARY
         NAMES ze_loader
     )
 endif()
@@ -64,26 +63,33 @@ endif()
 # lists of equal lengths, with the shorter string getting zero-padded.
 function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
     # Convert the strings to list
-    string(REPLACE  "." ";" VL1 ${VERSION_STR1})
-    string(REPLACE  "." ";" VL2 ${VERSION_STR2})
+    string(REPLACE "." ";" VL1 ${VERSION_STR1})
+    string(REPLACE "." ";" VL2 ${VERSION_STR2})
+
     # get lengths of both lists
     list(LENGTH VL1 VL1_LEN)
     list(LENGTH VL2 VL2_LEN)
     set(LEN ${VL1_LEN})
+
     # If they differ in size pad the shorter list with 0s
     if(VL1_LEN GREATER VL2_LEN)
         math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+
         foreach(IDX RANGE 1 ${DIFF} 1)
             list(APPEND VL2 "0")
         endforeach()
     elseif(VL2_LEN GREATER VL2_LEN)
         math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+
         foreach(IDX RANGE 1 ${DIFF} 1)
             list(APPEND VL2 "0")
         endforeach()
+
         set(LEN ${VL2_LEN})
     endif()
+
     math(EXPR LEN_SUB_ONE "${LEN}-1")
+
     foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
         list(GET VL1 ${IDX} VAL1)
         list(GET VL2 ${IDX} VAL2)
@@ -98,12 +104,10 @@ function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
             set(${OUTPUT} TRUE PARENT_SCOPE)
         endif()
     endforeach()
-
-    endfunction(compare_versions)
+endfunction(compare_versions)
 
 # Creates a small function to run and extract the LevelZero loader version.
 function(get_l0_loader_version)
-
     set(L0_VERSIONEER_SRC
         [====[
         #include <iostream>
@@ -142,19 +146,20 @@ function(get_l0_loader_version)
 
     # We need both the directories in the include path as ze_loader.h
     # includes "ze_api.h" and not "level_zero/ze_api.h".
-    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
-    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+    list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+    list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR}/level_zero)
     list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
     try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
-            "${CMAKE_BINARY_DIR}"
-            "${L0_VERSIONEER_FILE}"
-            LINK_LIBRARIES ${LevelZero_LIBRARY}
-            CMAKE_FLAGS
-                "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
-            RUN_OUTPUT_VARIABLE L0_VERSION
+        "${CMAKE_BINARY_DIR}"
+        "${L0_VERSIONEER_FILE}"
+        LINK_LIBRARIES ${LevelZeroRuntime_LIBRARY}
+        CMAKE_FLAGS
+        "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+        RUN_OUTPUT_VARIABLE L0_VERSION
     )
-    if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
-        set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+
+    if(${L0_VERSIONEER_COMPILE} AND(DEFINED L0_VERSIONEER_RUN))
+        set(LevelZeroRuntime_VERSION ${L0_VERSION} PARENT_SCOPE)
         message(STATUS "Found Level Zero of version: ${L0_VERSION}")
     else()
         message(FATAL_ERROR
@@ -163,59 +168,61 @@ function(get_l0_loader_version)
     endif()
 endfunction(get_l0_loader_version)
 
-if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
-    list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
-    list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+if(LevelZeroRuntime_INCLUDE_DIR AND LevelZeroRuntime_LIBRARY)
+    list(APPEND LevelZeroRuntime_LIBRARIES "${LevelZeroRuntime_LIBRARY}")
+    list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+
     if(OpenCL_FOUND)
-      list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+        list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
     endif()
 
-    cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
-    set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
-
-    if(NOT TARGET LevelZero::LevelZero)
-      add_library(LevelZero::LevelZero INTERFACE IMPORTED)
-      set_target_properties(LevelZero::LevelZero
-        PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
-      )
-      set_target_properties(LevelZero::LevelZero
-        PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
-      )
+    cmake_path(GET LevelZeroRuntime_LIBRARY PARENT_PATH LevelZeroRuntime_LIBRARIES_PATH)
+    set(LevelZeroRuntime_LIBRARIES_DIR ${LevelZeroRuntime_LIBRARIES_PATH})
+
+    if(NOT TARGET LevelZeroRuntime::LevelZeroRuntime)
+        add_library(LevelZeroRuntime::LevelZeroRuntime INTERFACE IMPORTED)
+        set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+            PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZeroRuntime_LIBRARIES}"
+        )
+        set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZeroRuntime_INCLUDE_DIRS}"
+        )
     endif()
 endif()
 
 # Check if a specific version of Level Zero is required
-if(LevelZero_FIND_VERSION)
+if(LevelZeroRuntime_FIND_VERSION)
     get_l0_loader_version()
     set(VERSION_GT_FIND_VERSION FALSE)
     compare_versions(
-        ${LevelZero_VERSION}
-        ${LevelZero_FIND_VERSION}
+        ${LevelZeroRuntime_VERSION}
+        ${LevelZeroRuntime_FIND_VERSION}
         VERSION_GT_FIND_VERSION
     )
+
     if(${VERSION_GT_FIND_VERSION})
-        set(LevelZero_FOUND TRUE)
+        set(LevelZeroRuntime_FOUND TRUE)
     else()
-        set(LevelZero_FOUND FALSE)
+        set(LevelZeroRuntime_FOUND FALSE)
     endif()
 else()
-    set(LevelZero_FOUND TRUE)
+    set(LevelZeroRuntime_FOUND TRUE)
 endif()
 
-find_package_handle_standard_args(LevelZero
+find_package_handle_standard_args(LevelZeroRuntime
     REQUIRED_VARS
-        LevelZero_FOUND
-        LevelZero_INCLUDE_DIRS
-        LevelZero_LIBRARY
-        LevelZero_LIBRARIES_DIR
+    LevelZeroRuntime_FOUND
+    LevelZeroRuntime_INCLUDE_DIRS
+    LevelZeroRuntime_LIBRARY
+    LevelZeroRuntime_LIBRARIES_DIR
     HANDLE_COMPONENTS
 )
-mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+mark_as_advanced(LevelZeroRuntime_LIBRARY LevelZeroRuntime_INCLUDE_DIRS)
 
-if(LevelZero_FOUND)
-    find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
-        "(found version ${LevelZero_VERSION})"
+if(LevelZeroRuntime_FOUND)
+    find_package_message(LevelZeroRuntime "Found LevelZero: ${LevelZeroRuntime_LIBRARY}"
+        "(found version ${LevelZeroRuntime_VERSION})"
     )
 else()
-    find_package_message(LevelZero "Could not find LevelZero" "")
+    find_package_message(LevelZeroRuntime "Could not find LevelZero" "")
 endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index dd2ac75b88798..06c879f082926 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -14,12 +14,13 @@ set(LLVM_OPTIONAL_SOURCES
   RunnerUtils.cpp
   OptUtils.cpp
   JitRunner.cpp
+  LevelZeroRuntimeWrappers.cpp
   SpirvCpuRuntimeWrappers.cpp
   SyclRuntimeWrappers.cpp
   VulkanRuntimeWrappers.cpp
   VulkanRuntime.cpp
   VulkanRuntime.h
-  )
+)
 
 # Use a separate library for OptUtils, to avoid pulling in the entire JIT and
 # codegen infrastructure. Unlike MLIRExecutionEngine, this is part of
@@ -45,7 +46,7 @@ add_mlir_library(MLIRExecutionEngineUtils
   IPO
   Passes
   TargetParser
-  )
+)
 
 if(NOT MLIR_ENABLE_EXECUTION_ENGINE)
   return()
@@ -53,12 +54,12 @@ endif()
 
 if(LLVM_USE_INTEL_JITEVENTS)
   set(LLVM_JIT_LISTENER_LIB
-      IntelJITEvents)
+    IntelJITEvents)
 endif(LLVM_USE_INTEL_JITEVENTS)
 
 if(LLVM_USE_PERF)
   set(LLVM_JIT_LISTENER_LIB
-      PerfJITEvents)
+    PerfJITEvents)
 endif(LLVM_USE_PERF)
 
 add_mlir_library(MLIRExecutionEngine
@@ -91,7 +92,7 @@ add_mlir_library(MLIRExecutionEngine
   IPO
   Passes
   ${LLVM_JIT_LISTENER_LIB}
-  )
+)
 
 mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
   MLIRBuiltinToLLVMIRTranslation
@@ -100,9 +101,9 @@ mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
   MLIRLLVMToLLVMIRTranslation
   MLIROpenMPToLLVMIRTranslation
   MLIRTargetLLVMIRExport
-  )
+)
 
-if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
+if(LLVM_BUILD_LLVM_DYLIB AND NOT(WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
   # Build a shared library for the execution engine. Some downstream projects
   # use this library to build their own CPU runners while preserving dynamic
   # linkage.
@@ -122,7 +123,7 @@ if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on
     LINK_LIBS PUBLIC
     LLVM
     MLIR
-    )
+  )
 endif()
 
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
@@ -162,7 +163,7 @@ if(LLVM_ENABLE_PIC)
     Float16bits.cpp
 
     EXCLUDE_FROM_LIBMLIR
-    )
+  )
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
@@ -179,7 +180,7 @@ if(LLVM_ENABLE_PIC)
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
-    )
+  )
   set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
 
@@ -205,6 +206,7 @@ if(LLVM_ENABLE_PIC)
   )
   set_property(TARGET mlir_async_runtime PROPERTY CXX_VISIBILITY_PRESET hidden)
   target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
+
   if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
     # Don't export symbols from link-time dependencies, these are internal
     # implementation details.
@@ -226,7 +228,8 @@ if(LLVM_ENABLE_PIC)
     # custom error message.
     include(CheckLanguage)
     check_language(CUDA)
-    if (CMAKE_CUDA_COMPILER)
+
+    if(CMAKE_CUDA_COMPILER)
       enable_language(CUDA)
     else()
       message(SEND_ERROR
@@ -290,13 +293,14 @@ if(LLVM_ENABLE_PIC)
 
   if(MLIR_ENABLE_ROCM_RUNNER)
     # Configure ROCm support.
-    if (NOT DEFINED ROCM_PATH)
-      if (NOT DEFINED ENV{ROCM_PATH})
+    if(NOT DEFINED ROCM_PATH)
+      if(NOT DEFINED ENV{ROCM_PATH})
         set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
       else()
         set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
       endif()
     endif()
+
     # A lot of the ROCm CMake files expect to find their own dependencies in
     # CMAKE_PREFIX_PATH and don't respect PATHS or HINTS :( .
     # Therefore, temporarily add the ROCm path to CMAKE_PREFIX_PATH so we can
@@ -306,24 +310,28 @@ if(LLVM_ENABLE_PIC)
     find_package(hip REQUIRED)
     set(CMAKE_PREFIX_PATH "${REAL_CMAKE_PREFIX_PATH}")
 
-    if (NOT DEFINED ROCM_TEST_CHIPSET)
+    if(NOT DEFINED ROCM_TEST_CHIPSET)
       find_program(ROCM_AGENT_ENUMERATOR rocm_agent_enumerator "${ROCM_PATH}/bin" /usr/bin /usr/local/bin)
+
       if(ROCM_AGENT_ENUMERATOR)
-          execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
+        execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
           OUTPUT_VARIABLE AGENTS_STRING
           ERROR_VARIABLE AGENTS_STRING
           RESULT_VARIABLE AGENT_ENUMERATOR_RESULT)
       else()
         message(SEND_ERROR "Could not find rocm_agent_enumerator")
       endif()
-      if (NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
+
+      if(NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
         message(SEND_ERROR "Could not run rocm_agent_enumerator and ROCM_TEST_CHIPSET is not defined")
         set(AGENTS_STRING "")
       endif()
+
       string(STRIP AGENTS_STRING ${AGENTS_STRING})
       string(REPLACE "\n" ";" AGENTS_LIST ${AGENTS_STRING})
       list(FILTER AGENTS_LIST EXCLUDE REGEX "gfx000")
-      if (AGENTS_LIST STREQUAL "")
+
+      if(AGENTS_LIST STREQUAL "")
         message(SEND_ERROR "No non-CPU ROCm agents found on the system, and ROCM_TEST_CHIPSET is not defined")
       else()
         list(GET AGENTS_LIST 0 FIRST_AGENT)
@@ -342,27 +350,34 @@ if(LLVM_ENABLE_PIC)
     # Supress compiler warnings from HIP headers
     check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi
       CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG)
-    if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
+
+    if(CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-c++98-compat-extra-semi")
     endif()
+
     check_cxx_compiler_flag(-Wno-return-type-c-linkage
-        CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
-    if (CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+      CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+
+    if(CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-return-type-c-linkage")
     endif()
+
     check_cxx_compiler_flag(-Wno-nested-anon-types
       CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
-    if (CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
+
+    if(CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-nested-anon-types")
     endif()
+
     check_cxx_compiler_flag(-Wno-gnu-anonymous-struct
       CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
-    if (CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+
+    if(CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
-      "-Wno-gnu-anonymous-struct")
+        "-Wno-gnu-anonymous-struct")
     endif()
 
     set_property(TARGET mlir_rocm_runtime
@@ -381,9 +396,9 @@ if(LLVM_ENABLE_PIC)
       message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
     endif()
 
-    find_package(LevelZero)
+    find_package(LevelZeroRuntime)
 
-    if(NOT LevelZero_FOUND)
+    if(NOT LevelZeroRuntime_FOUND)
       message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
     endif()
 
@@ -395,18 +410,51 @@ if(LLVM_ENABLE_PIC)
     )
 
     check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+
     if(NOT CXX_HAS_FRTTI_FLAG)
       message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
     endif()
-    target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
+
+    target_compile_options(mlir_sycl_runtime PUBLIC -fexceptions -frtti)
 
     target_include_directories(mlir_sycl_runtime PRIVATE
       ${MLIR_INCLUDE_DIRS}
     )
 
-    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime SyclRuntime::SyclRuntime)
+
+    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+  endif()
+
+  if(MLIR_ENABLE_LEVEL_ZERO_RUNNER)
+    find_package(LevelZeroRuntime)
+
+    if(NOT LevelZeroRuntime_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+
+    add_mlir_library(mlir_levelzero_runtime
+      SHARED
+      LevelZeroRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+
+    if(NOT CXX_HAS_FRTTI_FLAG)
+      message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+    endif()
+
+    target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(mlir_levelzero_runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(mlir_levelzero_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime)
 
-    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+    set_property(TARGET mlir_levelzero_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}")
   endif()
 
   if(MLIR_ENABLE_SPIRV_CPU_RUNNER)
@@ -422,25 +470,26 @@ if(LLVM_ENABLE_PIC)
       mlir_spirv_cpu_runtime_EXPORTS)
   endif()
 
-  if (MLIR_ENABLE_VULKAN_RUNNER)
+  if(MLIR_ENABLE_VULKAN_RUNNER)
     find_package(Vulkan)
 
     # If Vulkan is not found try a path specified by VULKAN_SDK.
-    if (NOT Vulkan_FOUND)
-      if ("$ENV{VULKAN_SDK}" STREQUAL "")
+    if(NOT Vulkan_FOUND)
+      if("$ENV{VULKAN_SDK}" STREQUAL "")
         message(FATAL_ERROR "Vulkan not found through CMake; please provide "
-                            "VULKAN_SDK path as an environment variable")
+          "VULKAN_SDK path as an environment variable")
       endif()
 
       find_library(Vulkan_LIBRARY vulkan HINTS "$ENV{VULKAN_SDK}/lib" REQUIRED)
-      if (Vulkan_LIBRARY)
+
+      if(Vulkan_LIBRARY)
         set(Vulkan_FOUND ON)
         set(Vulkan_INCLUDE_DIR "$ENV{VULKAN_SDK}/include")
         message(STATUS "Found Vulkan: " ${Vulkan_LIBRARY})
       endif()
     endif()
 
-    if (NOT Vulkan_FOUND)
+    if(NOT Vulkan_FOUND)
       message(FATAL_ERROR "Cannot find Vulkan library")
     endif()
 
diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
new file mode 100644
index 0000000000000..70ac4761dc7fd
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -0,0 +1,491 @@
+//===- LevelZeroRuntimeWrappers.cpp - MLIR Level Zero (L0) wrapper library-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements wrappers around the Level Zero (L0) runtime library with C linkage
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+
+#include <cassert>
+#include <deque>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <level_zero/ze_api.h>
+#include <limits>
+#include <unordered_set>
+#include <vector>
+
+namespace {
+
+template <typename F>
+auto catchAll(F &&func) {
+  try {
+    return func();
+  } catch (const std::exception &e) {
+    std::cerr << "An exception was thrown: " << e.what() << std::endl;
+    std::abort();
+  } catch (...) {
+    std::cerr << "An unknown exception was thrown." << std::endl;
+    std::abort();
+  }
+}
+
+#define L0_SAFE_CALL(call)                                                     \
+  {                                                                            \
+    ze_result_t status = (call);                                               \
+    if (status != ZE_RESULT_SUCCESS) {                                         \
+      std::cerr << "L0 error " << status << std::endl;                         \
+      std::abort();                                                            \
+    }                                                                          \
+  }
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// L0 RT context & device setters
+//===----------------------------------------------------------------------===//
+
+// Returns the L0 driver handle for the given index. Default index is 0
+// (i.e., returns the first driver handle of the available drivers).
+
+static ze_driver_handle_t getDriver(uint32_t idx = 0) {
+  ze_init_driver_type_desc_t driver_type = {};
+  driver_type.stype = ZE_STRUCTURE_TYPE_INIT_DRIVER_TYPE_DESC;
+  driver_type.flags = ZE_INIT_DRIVER_TYPE_FLAG_GPU;
+  driver_type.pNext = nullptr;
+  uint32_t driverCount{0};
+  thread_local static std::vector<ze_driver_handle_t> drivers;
+
+  thread_local static bool isDriverInitialised{false};
+  if (isDriverInitialised)
+    return drivers[idx];
+  L0_SAFE_CALL(zeInitDrivers(&driverCount, nullptr, &driver_type));
+  if (!driverCount)
+    throw std::runtime_error("No L0 drivers found.");
+  drivers.resize(driverCount);
+  L0_SAFE_CALL(zeInitDrivers(&driverCount, drivers.data(), &driver_type));
+  if (idx >= driverCount)
+    throw std::runtime_error((llvm::Twine("Requested driver idx out-of-bound, "
+                                          "number of available drivers: ") +
+                              std::to_string(driverCount))
+                                 .str());
+  isDriverInitialised = true;
+  return drivers[idx];
+}
+
+static ze_device_handle_t getDefaultDevice(const uint32_t driverIdx = 0,
+                                           const int32_t devIdx = 0) {
+  thread_local static ze_device_handle_t l0Device;
+  thread_local static int32_t currDevIdx{-1};
+  if (devIdx == currDevIdx)
+    return l0Device;
+  auto driver = getDriver(driverIdx);
+  uint32_t deviceCount{0};
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr));
+  if (!deviceCount)
+    throw std::runtime_error(
+        "getDefaultDevice failed: did not find L0 device.");
+  if (static_cast<int>(deviceCount) < devIdx + 1)
+    throw std::runtime_error("getDefaultDevice failed: devIdx out-of-bounds.");
+  std::vector<ze_device_handle_t> devices(deviceCount);
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
+  l0Device = devices[devIdx];
+  currDevIdx = devIdx;
+  return l0Device;
+}
+
+// Returns the default L0 context of the default driver.
+static ze_context_handle_t getDefaultContext() {
+  thread_local static ze_context_handle_t context;
+  thread_local static bool isContextInitialised{false};
+  if (isContextInitialised)
+    return context;
+  ze_context_desc_t ctxtDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  auto driver = getDriver();
+  L0_SAFE_CALL(zeContextCreate(driver, &ctxtDesc, &context));
+  isContextInitialised = true;
+  return context;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 RT helper structs
+//===----------------------------------------------------------------------===//
+
+struct L0RtContext {
+  ze_driver_handle_t driver{nullptr};
+  ze_device_handle_t device{nullptr};
+  ze_context_handle_t context{nullptr};
+  // Usually, one immediate command list with ordinal 0 suffices for
+  // both copy and compute ops, but leaves HW underutilized.
+  ze_command_list_handle_t immCmdListCompute{nullptr};
+  // Copy engines can be used for both memcpy and memset, but
+  // they have limitations for memset pattern size (e.g., 1 byte).
+  ze_command_list_handle_t immCmdListCopy{nullptr};
+  uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
+
+  L0RtContext(const int32_t devIdx = 0)
+      : driver(getDriver()), device(getDefaultDevice(devIdx)),
+        context(getDefaultContext()) {
+    uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
+    ze_device_properties_t deviceProperties = {};
+    L0_SAFE_CALL(zeDeviceGetProperties(device, &deviceProperties));
+    uint32_t queueGroupCount = 0;
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, nullptr));
+    std::vector<ze_command_queue_group_properties_t> queueGroupProperties(
+        queueGroupCount);
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, queueGroupProperties.data()));
+    for (uint32_t queueGroupIdx = 0; queueGroupIdx < queueGroupCount;
+         ++queueGroupIdx) {
+      const auto &group = queueGroupProperties[queueGroupIdx];
+      if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+        computeEngineOrdinal = queueGroupIdx;
+      else if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) {
+        copyEngineOrdinal = queueGroupIdx;
+        copyEngineMaxMemoryFillPatternSize = group.maxMemoryFillPatternSize;
+      }
+      if (copyEngineOrdinal != -1u && computeEngineOrdinal != -1u)
+        break;
+    }
+    // Fallback to the default queue if no dedicated copy queue is available.
+    if (copyEngineOrdinal == -1u)
+      copyEngineOrdinal = computeEngineOrdinal;
+    assert(copyEngineOrdinal != -1u && computeEngineOrdinal != -1u &&
+           "Expected two engines to be available.");
+    ze_command_queue_desc_t cmdQueueDesc{
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        copyEngineOrdinal, // ordinal
+        0,                 // index (assume one physical engine in the group)
+        0,                 // flags
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
+                                              &immCmdListCopy));
+    cmdQueueDesc.ordinal = computeEngineOrdinal;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
+                                              &immCmdListCompute));
+  }
+  void cleanup() {
+    L0_SAFE_CALL(zeCommandListDestroy(immCmdListCopy));
+    L0_SAFE_CALL(zeCommandListDestroy(immCmdListCompute));
+    L0_SAFE_CALL(zeContextDestroy(context));
+  }
+  ~L0RtContext() { cleanup(); }
+};
+
+// L0 only supports pre-determined sizes of event pools; implement a
+// runtime data structure to avoid running out of events.
+struct DynamicEventPool {
+  constexpr static size_t numEventsPerPool{128};
+  std::vector<ze_event_pool_handle_t> eventPools;
+  std::vector<ze_event_handle_t> availableEvents;
+  std::unordered_set<ze_event_handle_t> takenEvents;
+  size_t currentEventsLimit{0};
+  size_t currentEventsCnt{0};
+  L0RtContext *rtCtx;
+
+  DynamicEventPool(L0RtContext *rtCtx) : rtCtx(rtCtx) {
+    createNewPool(numEventsPerPool);
+  }
+
+  ~DynamicEventPool() {
+    assert(!takenEvents.size());
+    // zeEventDestroy will trigger L0_SAFE_CALL if an event is still used by
+    // device
+    for (auto event : availableEvents)
+      L0_SAFE_CALL(zeEventDestroy(event));
+    for (auto pool : eventPools)
+      L0_SAFE_CALL(zeEventPoolDestroy(pool));
+  }
+
+  void createNewPool(size_t numEvents) {
+    ze_event_pool_desc_t eventPoolDesc = {};
+    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+    eventPoolDesc.count = numEvents;
+    eventPools.push_back(nullptr);
+    L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context, &eventPoolDesc, 1,
+                                   &rtCtx->device, &eventPools.back()));
+    currentEventsLimit += numEvents;
+  }
+
+  ze_event_handle_t takeEvent() {
+    ze_event_handle_t event{nullptr};
+    if (availableEvents.size()) {
+      event = availableEvents.back();
+      availableEvents.pop_back();
+    } else {
+      if (currentEventsCnt == currentEventsLimit)
+        createNewPool(numEventsPerPool);
+      currentEventsCnt++;
+      ze_event_desc_t eventDesc = {
+          ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
+          static_cast<uint32_t>(currentEventsCnt % numEventsPerPool),
+          ZE_EVENT_SCOPE_FLAG_DEVICE, ZE_EVENT_SCOPE_FLAG_HOST};
+      L0_SAFE_CALL(zeEventCreate(eventPools.back(), &eventDesc, &event));
+    }
+    takenEvents.insert(event);
+    return event;
+  }
+
+  void releaseEvent(ze_event_handle_t event) {
+    auto found = takenEvents.find(event);
+    assert(found != takenEvents.end());
+    takenEvents.erase(found);
+    L0_SAFE_CALL(zeEventHostReset(event));
+    availableEvents.push_back(event);
+  }
+};
+
+L0RtContext &getRtContext() {
+  thread_local static L0RtContext rtContext;
+  return rtContext;
+}
+
+DynamicEventPool &getDynamicEventPool() {
+  thread_local static DynamicEventPool dynEventPool{&getRtContext()};
+  return dynEventPool;
+}
+
+// Models an in-order "stream" on top of the immediate command lists.
+// Ordering is achieved by event chaining: every enqueued operation waits on
+// the previously recorded implicit event and signals a fresh one.
+struct StreamWrapper {
+  // Implicit events in submission order. A deque (not a vector) is used to
+  // avoid event pointer invalidations: enqueueOp hands the address of the
+  // last element to the callback while new events keep being appended.
+  std::deque<ze_event_handle_t> implicitEventStack;
+  DynamicEventPool &dynEventPool;
+
+  StreamWrapper(DynamicEventPool &dynEventPool) : dynEventPool(dynEventPool) {}
+  // Drain all outstanding work before the stream goes away.
+  ~StreamWrapper() { sync(); }
+
+  // Returns a pointer to the most recently recorded implicit event, or
+  // nullptr when the stream has no pending operations.
+  ze_event_handle_t *getLastImplicitEventPtr() {
+    // Assume current implicit events will not be used after `sync`.
+    return implicitEventStack.size() ? &implicitEventStack.back() : nullptr;
+  }
+
+  // Blocks the host until `explicitEvent` (or, when null, the newest
+  // implicit event) is signaled, then recycles all implicit events. Waiting
+  // on the newest implicit event suffices because events are chained.
+  void sync(ze_event_handle_t explicitEvent = nullptr) {
+    ze_event_handle_t syncEvent{nullptr};
+    if (!explicitEvent) {
+      ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+      syncEvent = lastImplicitEventPtr ? *lastImplicitEventPtr : nullptr;
+    } else {
+      syncEvent = explicitEvent;
+    }
+    if (syncEvent)
+      L0_SAFE_CALL(zeEventHostSynchronize(
+          syncEvent, std::numeric_limits<uint64_t>::max()));
+    // All of the "implicit" events were signaled and are of no use, release
+    // them. An "explicit" event must be "released" via mgpuEventDestroy.
+    for (auto event : implicitEventStack)
+      dynEventPool.releaseEvent(event);
+    implicitEventStack.clear();
+  }
+
+  // Appends one operation to the stream. `op` receives the event it must
+  // signal on completion plus the wait list (the previous implicit event,
+  // if any) that establishes in-order execution.
+  void enqueueOp(
+      std::function<void(ze_event_handle_t, uint32_t, ze_event_handle_t *)>
+          op) {
+    ze_event_handle_t newImplicitEvent = dynEventPool.takeEvent();
+    ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+    const uint32_t numWaitEvents = lastImplicitEventPtr ? 1 : 0;
+    op(newImplicitEvent, numWaitEvents, lastImplicitEventPtr);
+    implicitEventStack.push_back(newImplicitEvent);
+  }
+};
+
+// Builds a Level Zero module from a SPIR-V binary blob. On build failure,
+// prints the compiler's build log to stderr and aborts.
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
+  assert(data);
+  ze_module_handle_t zeModule;
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  ze_module_build_log_handle_t buildLogHandle;
+  ze_result_t result =
+      zeModuleCreate(getRtContext().context, getRtContext().device, &desc,
+                     &zeModule, &buildLogHandle);
+  if (result != ZE_RESULT_SUCCESS) {
+    std::cerr << "Error creating module, error code: " << result << std::endl;
+    size_t logSize = 0;
+    L0_SAFE_CALL(zeModuleBuildLogGetString(buildLogHandle, &logSize, nullptr));
+    // Pre-size the buffer to logSize characters. Note: the previous
+    // std::string(" ", logSize) selected the (const char*, count) ctor and
+    // read logSize bytes from a 2-byte literal -- undefined behavior.
+    std::string buildLog(logSize, '\0');
+    L0_SAFE_CALL(
+        zeModuleBuildLogGetString(buildLogHandle, &logSize, buildLog.data()));
+    std::cerr << "Build log:\n" << buildLog << std::endl;
+    std::abort();
+  }
+  // The build log handle is only consumed on failure; destroy it on the
+  // success path so it does not leak.
+  L0_SAFE_CALL(zeModuleBuildLogDestroy(buildLogHandle));
+  return zeModule;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 Wrappers definition
+//===----------------------------------------------------------------------===//
+
+// Creates a stream bound to this thread's event pool. The caller owns the
+// result and must release it with mgpuStreamDestroy.
+extern "C" StreamWrapper *mgpuStreamCreate() {
+  return new StreamWrapper(getDynamicEventPool());
+}
+
+// Blocks the host until all work enqueued on `stream` has completed.
+// Tolerates a null stream (no-op).
+extern "C" void mgpuStreamSynchronize(StreamWrapper *stream) {
+  if (stream)
+    stream->sync();
+}
+
+// Destroys a stream; the destructor synchronizes outstanding work first.
+extern "C" void mgpuStreamDestroy(StreamWrapper *stream) {
+  if (stream)
+    delete stream;
+}
+
+// Waits until `event` is signaled and recycles the stream's implicit events.
+// NOTE: sync() uses zeEventHostSynchronize, so this blocks the host rather
+// than inserting a device-side wait.
+extern "C" void mgpuStreamWaitEvent(StreamWrapper *stream,
+                                    ze_event_handle_t event) {
+  assert(stream && event);
+  stream->sync(event);
+}
+
+// Takes an event from this thread's pool; release it via mgpuEventDestroy.
+extern "C" ze_event_handle_t mgpuEventCreate() {
+  return getDynamicEventPool().takeEvent();
+}
+
+// Resets the event and returns it to the pool (events are pooled, not freed).
+extern "C" void mgpuEventDestroy(ze_event_handle_t event) {
+  return getDynamicEventPool().releaseEvent(event);
+}
+
+// Blocks the host until `event` is signaled, then resets it for reuse.
+extern "C" void mgpuEventSynchronize(ze_event_handle_t event) {
+  L0_SAFE_CALL(
+      zeEventHostSynchronize(event, std::numeric_limits<uint64_t>::max()));
+  L0_SAFE_CALL(zeEventHostReset(event));
+}
+
+// Appends a signal of `event` to both immediate command lists so it fires
+// after previously submitted copy and compute work.
+// NOTE(review): `stream` is unused, and the same event is appended as a
+// signal operation on two command lists -- confirm this matches the intended
+// record semantics (first list to reach the signal fires the event).
+extern "C" void mgpuEventRecord(ze_event_handle_t event,
+                                StreamWrapper *stream) {
+  L0_SAFE_CALL(
+      zeCommandListAppendSignalEvent(getRtContext().immCmdListCopy, event));
+  L0_SAFE_CALL(
+      zeCommandListAppendSignalEvent(getRtContext().immCmdListCompute, event));
+}
+
+// Allocates `size` bytes of device memory (or host/device shared memory when
+// `isShared`) with 64-byte alignment. `stream` is unused: the allocation is
+// performed synchronously. Throws (caught by catchAll) if the runtime
+// returns a null pointer.
+extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
+                              bool isShared) {
+  return catchAll([&]() {
+    void *memPtr = nullptr;
+    constexpr size_t alignment{64};
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+    if (isShared) {
+      ze_host_mem_alloc_desc_t hostDesc = {};
+      hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+      L0_SAFE_CALL(zeMemAllocShared(getRtContext().context, &deviceDesc,
+                                    &hostDesc, size, alignment,
+                                    getRtContext().device, &memPtr));
+    } else {
+      L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context, &deviceDesc, size,
+                                    alignment, getRtContext().device, &memPtr));
+    }
+    if (!memPtr)
+      throw std::runtime_error("mem allocation failed!");
+    return memPtr;
+  });
+}
+
+// Synchronizes the stream so no enqueued operation still uses `ptr`, then
+// frees the allocation. Tolerates null `stream` and null `ptr`, consistent
+// with the other wrappers (the original dereferenced `stream`
+// unconditionally).
+extern "C" void mgpuMemFree(void *ptr, StreamWrapper *stream) {
+  if (stream)
+    stream->sync();
+  if (ptr)
+    L0_SAFE_CALL(zeMemFree(getRtContext().context, ptr));
+}
+
+// Enqueues an asynchronous copy of `sizeBytes` bytes from `src` to `dst` on
+// the copy-engine immediate command list, ordered after prior stream
+// operations via the stream's implicit event chain.
+extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
+                           StreamWrapper *stream) {
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendMemoryCopy(getRtContext().immCmdListCopy,
+                                               dst, src, sizeBytes, newEvent,
+                                               numWaitEvents, waitEvents));
+  });
+}
+
+// Enqueues an asynchronous fill of `count` elements of `value` at `dst`.
+// Prefers the copy engine; falls back to the compute command list when the
+// copy engine's maximum fill-pattern size is smaller than the element type.
+template <typename PATTERN_TYPE>
+void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
+                StreamWrapper *stream) {
+  auto listType =
+      getRtContext().copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
+          ? getRtContext().immCmdListCopy
+          : getRtContext().immCmdListCompute;
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendMemoryFill(
+        listType, dst, &value, sizeof(PATTERN_TYPE),
+        count * sizeof(PATTERN_TYPE), newEvent, numWaitEvents, waitEvents));
+  });
+}
+// 32-bit pattern variant of the templated fill above.
+extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count,
+                             StreamWrapper *stream) {
+  mgpuMemset<unsigned int>(dst, value, count, stream);
+}
+
+// 16-bit pattern variant of the templated fill above.
+extern "C" void mgpuMemset16(void *dst, unsigned short value, size_t count,
+                             StreamWrapper *stream) {
+  mgpuMemset<unsigned short>(dst, value, count, stream);
+}
+
+// Loads a SPIR-V blob as a Level Zero module; aborts on build failure
+// (see loadModule).
+extern "C" ze_module_handle_t mgpuModuleLoad(const void *data,
+                                             size_t gpuBlobSize) {
+  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
+}
+
+// Creates a kernel handle for the kernel named `name` inside `module`.
+extern "C" ze_kernel_handle_t mgpuModuleGetFunction(ze_module_handle_t module,
+                                                    const char *name) {
+  assert(module && name);
+  ze_kernel_handle_t zeKernel;
+  ze_kernel_desc_t desc = {};
+  desc.pKernelName = name;
+  L0_SAFE_CALL(zeKernelCreate(module, &desc, &zeKernel));
+  return zeKernel;
+}
+
+// Enqueues a kernel launch on the compute immediate command list, ordered
+// after prior stream operations. When sharedMemBytes > 0, the last entry of
+// `params` is treated as a dynamic local-memory argument and is bound by
+// size only (null argument value). `extra` is unused.
+extern "C" void mgpuLaunchKernel(ze_kernel_handle_t kernel, size_t gridX,
+                                 size_t gridY, size_t gridZ, size_t blockX,
+                                 size_t blockY, size_t blockZ,
+                                 size_t sharedMemBytes, StreamWrapper *stream,
+                                 void **params, void ** /*extra*/,
+                                 size_t paramsCount) {
+
+  if (sharedMemBytes > 0) {
+    paramsCount = paramsCount - 1; // Last param is shared memory size
+    L0_SAFE_CALL(
+        zeKernelSetArgumentValue(kernel, paramsCount, sharedMemBytes, nullptr));
+  }
+  for (size_t i = 0; i < paramsCount; ++i)
+    L0_SAFE_CALL(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
+                                          sizeof(void *), params[i]));
+  L0_SAFE_CALL(zeKernelSetGroupSize(kernel, blockX, blockY, blockZ));
+  ze_group_count_t dispatch;
+  dispatch.groupCountX = static_cast<uint32_t>(gridX);
+  dispatch.groupCountY = static_cast<uint32_t>(gridY);
+  dispatch.groupCountZ = static_cast<uint32_t>(gridZ);
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendLaunchKernel(
+        getRtContext().immCmdListCompute, kernel, &dispatch, newEvent,
+        numWaitEvents, waitEvents));
+  });
+}
+
+// Destroys a module created by mgpuModuleLoad.
+extern "C" void mgpuModuleUnload(ze_module_handle_t module) {
+  L0_SAFE_CALL(zeModuleDestroy(module));
+}
+
+// Re-targets this thread's runtime context and event pool to device `devIdx`
+// by tearing down and re-creating both.
+extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
+  catchAll([&]() {
+    // For now, a user must ensure that streams and events complete
+    // and are destroyed before switching a device.
+    getRtContext().cleanup();
+    getRtContext() = L0RtContext(devIdx);
+    getDynamicEventPool() = DynamicEventPool(&getRtContext());
+  });
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..e4b566b74c862
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %mem = gpu.alloc host_shared () : memref<2x2x2xf32>
+    memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
+    memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+    gpu.wait [%3]
+    %alloc = memref.alloc() : memref<2x2x2xf32>
+    memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %4 = gpu.wait async
+    %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
+    %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
+    %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
+    gpu.wait [%7]
+    return %alloc : memref<2x2x2xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = gpu.block_id  z
+      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+      %5 = arith.addf %3, %4 : f32
+      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [2.3, 4.5]
+  // CHECK: [7.8, 10.2]
+  // CHECK: [12.7, 14.9]
+  // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
new file mode 100644
index 0000000000000..c5aecd569ea75
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xi64>)
+  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %mem = gpu.alloc host_shared () : memref<3x3xi64>
+    memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
+    %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
+    memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+    %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+    gpu.wait [%3]
+    %alloc = memref.alloc() : memref<3x3xi64>
+    memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+    %4 = gpu.wait async
+    %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+    %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+    %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
+    gpu.wait [%7]
+    return %alloc : memref<3x3xi64>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+  // CHECK: [2,   4100,   6],
+  // CHECK: [16777224,   10,   4294971404],
+  // CHECK: [16777230,   1103806595088,   1099511627794]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..94a27906ed1f3
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    memref.dealloc %2 : memref<2x2x2xf32>
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_1 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
+    %alloc = memref.alloc() : memref<2x2x2xf32>
+    gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    gpu.dealloc %memref_1 : memref<2x2x2xf32>
+    gpu.dealloc %memref_0 : memref<2x2x2xf32>
+    gpu.dealloc %memref : memref<2x2x2xf32>
+    return %alloc : memref<2x2x2xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = gpu.block_id  z
+      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+      %5 = arith.addf %3, %4 : f32
+      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [2.3, 4.5]
+  // CHECK: [7.8, 10.2]
+  // CHECK: [12.7, 14.9]
+  // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
new file mode 100644
index 0000000000000..e385daefcb9b5
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
@@ -0,0 +1,79 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @relu attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
+    [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
+    [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
+    [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
+    [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
+  ]>
+
+  func.func @main() {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>
+
+    scf.for %arg0 = %c0 to %c100 step %c1 {
+      %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
+      %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
+      func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+      // CHECK: [0, 0, 0, 0.4, 0.5],
+      // CHECK: [0.1, 0, 0.3, 0, 0.5],
+      // CHECK: [0.1, 0.2, 0.3, 0, 0],
+      // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
+    }
+    return
+  }
+
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
+    %c5 = arith.constant 5 : index
+    %c4 = arith.constant 4 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<4x5xf32>
+    memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
+    %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2]  @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+    gpu.wait [%3]
+    %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
+    %4 = gpu.wait async
+    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
+    gpu.wait [%5]
+    %alloc = memref.alloc() : memref<4x5xf32>
+    memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
+    %6 = gpu.wait async
+    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
+    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
+    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+    return %alloc : memref<4x5xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
+      %3 = arith.cmpf olt, %2, %arg1 : f32
+      memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+      gpu.return
+    }
+  }
+  gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+      %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+      %4 = arith.select %2, %arg2, %3 : f32
+      memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
+      gpu.return
+    }
+  }
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg b/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
new file mode 100644
index 0000000000000..36c7ad5f57c7e
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_levelzero_runner:
+    config.unsupported = True

>From 72b2774ecf82643b23a09c349310806ee13651ea Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Mon, 28 Jul 2025 21:25:41 +0000
Subject: [PATCH 02/12] Fix CMake options.

---
 mlir/CMakeLists.txt                     | 1 +
 mlir/lib/ExecutionEngine/CMakeLists.txt | 2 +-
 mlir/test/CMakeLists.txt                | 4 ++++
 mlir/test/lit.cfg.py                    | 3 +++
 mlir/test/lit.site.cfg.py.in            | 1 +
 5 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a1ad81f625cd6..a9414eb324d09 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -140,6 +140,7 @@ endif()
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
+set(MLIR_ENABLE_LEVELZERO_RUNNER 0 CACHE BOOL "Enable building the MLIR LevelZero runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the MLIR SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the MLIR Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 06c879f082926..136f94a55d0fe 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -426,7 +426,7 @@ if(LLVM_ENABLE_PIC)
     set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
   endif()
 
-  if(MLIR_ENABLE_LEVEL_ZERO_RUNNER)
+  if(MLIR_ENABLE_LEVELZERO_RUNNER)
     find_package(LevelZeroRuntime)
 
     if(NOT LevelZeroRuntime_FOUND)
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 89568e7766ae5..a4a942de3c9a7 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -167,6 +167,10 @@ if(MLIR_ENABLE_SYCL_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
 endif()
 
+if(MLIR_ENABLE_LEVELZERO_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_levelzero_runtime)
+endif()
+
 if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB)
   list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs)
 endif()
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index feaf5fb852a1d..f392bdacadd3c 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -224,6 +224,9 @@ def find_real_python_interpreter():
 if config.enable_sycl_runner:
     tools.extend([add_runtime("mlir_sycl_runtime")])
 
+if config.enable_levelzero_runner:
+    tools.extend([add_runtime("mlir_levelzero_runtime")])
+
 if config.enable_spirv_cpu_runner:
     tools.extend([add_runtime("mlir_spirv_cpu_runtime")])
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index b1185e19d86e8..d904780af4224 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
+config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@

>From c297c8a1e5823f7a66bc457cb413cac568b6e352 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Tue, 29 Jul 2025 17:39:54 +0000
Subject: [PATCH 03/12] Address review comments.

---
 mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp |  2 +-
 .../GPU/LEVELZERO/gpu-addf32-to-spirv.mlir            |  4 +++-
 .../GPU/LEVELZERO/gpu-addi64-to-spirv.mlir            |  4 +++-
 .../GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir     |  4 +++-
 .../GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir           | 11 ++++++++---
 5 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
index 70ac4761dc7fd..9d389d9f101ad 100644
--- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -12,12 +12,12 @@
 
 #include "llvm/ADT/Twine.h"
 
+#include "level_zero/ze_api.h"
 #include <cassert>
 #include <deque>
 #include <exception>
 #include <functional>
 #include <iostream>
-#include <level_zero/ze_api.h>
 #include <limits>
 #include <unordered_set>
 #include <vector>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
index e4b566b74c862..82cefb4e3279b 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_levelzero_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -26,7 +26,9 @@ module @add attributes {gpu.container_module} {
     memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
     %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
     %2 = gpu.wait async
+
     %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+
     gpu.wait [%3]
     %alloc = memref.alloc() : memref<2x2x2xf32>
     memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
index c5aecd569ea75..61400874b3716 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_levelzero_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -26,7 +26,9 @@ module @add attributes {gpu.container_module} {
     memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
     %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
     %2 = gpu.wait async
+
     %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+
     gpu.wait [%3]
     %alloc = memref.alloc() : memref<3x3xi64>
     memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
index 94a27906ed1f3..3531d14a98d16 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_levelzero_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -26,7 +26,9 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc () : memref<2x2x2xf32>
     gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
     %memref_1 = gpu.alloc () : memref<2x2x2xf32>
+
     gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
+
     %alloc = memref.alloc() : memref<2x2x2xf32>
     gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
     gpu.dealloc %memref_1 : memref<2x2x2xf32>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
index e385daefcb9b5..f935a04a8c7a3 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
 // RUN: | mlir-runner \
-// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
@@ -41,11 +41,16 @@ module @relu attributes {gpu.container_module} {
     memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
     %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
     %2 = gpu.wait async
+
     %3 = gpu.launch_func async [%2]  @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+
     gpu.wait [%3]
     %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
     %4 = gpu.wait async
-    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
+
+    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
+
+    %memref_1 : memref<4x5xf32>)
     gpu.wait [%5]
     %alloc = memref.alloc() : memref<4x5xf32>
     memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>

>From c943f95d4a66fc7e2445177f9a2f58f9f288c4a4 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 03:47:59 +0000
Subject: [PATCH 04/12] Use std::unique_ptr for L0 handles.

Fix CMake whitespace issue.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt       |   2 +-
 .../LevelZeroRuntimeWrappers.cpp              | 123 +++++++++++-------
 2 files changed, 80 insertions(+), 45 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 136f94a55d0fe..f0d590ce1f85f 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -512,4 +512,4 @@ if(LLVM_ENABLE_PIC)
       ${Vulkan_LIBRARY}
     )
   endif()
-endif()
+endif()
\ No newline at end of file
diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
index 9d389d9f101ad..729bc76978956 100644
--- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -45,7 +45,6 @@ auto catchAll(F &&func) {
       std::abort();                                                            \
     }                                                                          \
   }
-
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -118,23 +117,49 @@ static ze_context_handle_t getDefaultContext() {
 // L0 RT helper structs
 //===----------------------------------------------------------------------===//
 
+struct ZeContextDeleter {
+  void operator()(ze_context_handle_t ctx) const {
+    if (ctx)
+      L0_SAFE_CALL(zeContextDestroy(ctx));
+  }
+};
+
+struct ZeCommandListDeleter {
+  void operator()(ze_command_list_handle_t cmdList) const {
+    if (cmdList)
+      L0_SAFE_CALL(zeCommandListDestroy(cmdList));
+  }
+};
+
 struct L0RtContext {
   ze_driver_handle_t driver{nullptr};
   ze_device_handle_t device{nullptr};
-  ze_context_handle_t context{nullptr};
+  using UniqueZeContext =
+      std::unique_ptr<std::remove_pointer<ze_context_handle_t>::type,
+                      ZeContextDeleter>;
+  UniqueZeContext context;
+
   // Usually, one immediate command list with ordinal 0 suffices for
   // both copy and compute ops, but leaves HW underutilized.
-  ze_command_list_handle_t immCmdListCompute{nullptr};
+  using UniqueZeCommandList =
+      std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
+                      ZeCommandListDeleter>;
+  UniqueZeCommandList immCmdListCompute;
   // Copy engines can be used for both memcpy and memset, but
   // they have limitations for memset pattern size (e.g., 1 byte).
-  ze_command_list_handle_t immCmdListCopy{nullptr};
+  UniqueZeCommandList immCmdListCopy;
   uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
 
+  L0RtContext() = default;
   L0RtContext(const int32_t devIdx = 0)
-      : driver(getDriver()), device(getDefaultDevice(devIdx)),
-        context(getDefaultContext()) {
+      : driver(getDriver()), device(getDefaultDevice(devIdx)) {
+    // Create context
+    ze_context_handle_t defaultCtx = getDefaultContext();
+    context.reset(defaultCtx);
+
+    // Determine ordinals
     uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
-    ze_device_properties_t deviceProperties = {};
+    ze_device_properties_t deviceProperties{};
     L0_SAFE_CALL(zeDeviceGetProperties(device, &deviceProperties));
     uint32_t queueGroupCount = 0;
     L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
@@ -143,6 +168,7 @@ struct L0RtContext {
         queueGroupCount);
     L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
         device, &queueGroupCount, queueGroupProperties.data()));
+
     for (uint32_t queueGroupIdx = 0; queueGroupIdx < queueGroupCount;
          ++queueGroupIdx) {
       const auto &group = queueGroupProperties[queueGroupIdx];
@@ -155,11 +181,15 @@ struct L0RtContext {
       if (copyEngineOrdinal != -1u && computeEngineOrdinal != -1u)
         break;
     }
+
     // Fallback to the default queue if no dedicated copy queue is available.
     if (copyEngineOrdinal == -1u)
       copyEngineOrdinal = computeEngineOrdinal;
+
     assert(copyEngineOrdinal != -1u && computeEngineOrdinal != -1u &&
            "Expected two engines to be available.");
+
+    // Create copy command list
     ze_command_queue_desc_t cmdQueueDesc{
         ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
         nullptr,
@@ -168,18 +198,25 @@ struct L0RtContext {
         0,                 // flags
         ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
         ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
-    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
-                                              &immCmdListCopy));
+
+    ze_command_list_handle_t rawCmdListCopy = nullptr;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context.get(), device,
+                                              &cmdQueueDesc, &rawCmdListCopy));
+    immCmdListCopy.reset(rawCmdListCopy);
+
+    // Create compute command list
     cmdQueueDesc.ordinal = computeEngineOrdinal;
-    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
-                                              &immCmdListCompute));
-  }
-  void cleanup() {
-    L0_SAFE_CALL(zeCommandListDestroy(immCmdListCopy));
-    L0_SAFE_CALL(zeCommandListDestroy(immCmdListCompute));
-    L0_SAFE_CALL(zeContextDestroy(context));
+    ze_command_list_handle_t rawCmdListCompute = nullptr;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(
+        context.get(), device, &cmdQueueDesc, &rawCmdListCompute));
+    immCmdListCompute.reset(rawCmdListCompute);
   }
-  ~L0RtContext() { cleanup(); }
+  L0RtContext(const L0RtContext &) = delete;
+  L0RtContext &operator=(const L0RtContext &) = delete;
+  // Allow move
+  L0RtContext(L0RtContext &&) noexcept = default;
+  L0RtContext &operator=(L0RtContext &&) noexcept = default;
+  ~L0RtContext() = default;
 };
 
 // L0 only supports pre-determined sizes of event pools,
@@ -212,7 +249,7 @@ struct DynamicEventPool {
     eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
     eventPoolDesc.count = numEvents;
     eventPools.push_back(nullptr);
-    L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context, &eventPoolDesc, 1,
+    L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context.get(), &eventPoolDesc, 1,
                                    &rtCtx->device, &eventPools.back()));
     currentEventsLimit += numEvents;
   }
@@ -246,7 +283,7 @@ struct DynamicEventPool {
 };
 
 L0RtContext &getRtContext() {
-  thread_local static L0RtContext rtContext;
+  thread_local static L0RtContext rtContext(0);
   return rtContext;
 }
 
@@ -286,13 +323,13 @@ struct StreamWrapper {
     implicitEventStack.clear();
   }
 
-  void enqueueOp(
-      std::function<void(ze_event_handle_t, uint32_t, ze_event_handle_t *)>
-          op) {
+  template <typename Func>
+  void enqueueOp(Func &&op) {
     ze_event_handle_t newImplicitEvent = dynEventPool.takeEvent();
     ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
     const uint32_t numWaitEvents = lastImplicitEventPtr ? 1 : 0;
-    op(newImplicitEvent, numWaitEvents, lastImplicitEventPtr);
+    std::forward<Func>(op)(newImplicitEvent, numWaitEvents,
+                           lastImplicitEventPtr);
     implicitEventStack.push_back(newImplicitEvent);
   }
 };
@@ -309,7 +346,7 @@ static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
                            nullptr};
   ze_module_build_log_handle_t buildLogHandle;
   ze_result_t result =
-      zeModuleCreate(getRtContext().context, getRtContext().device, &desc,
+      zeModuleCreate(getRtContext().context.get(), getRtContext().device, &desc,
                      &zeModule, &buildLogHandle);
   if (result != ZE_RESULT_SUCCESS) {
     std::cerr << "Error creating module, error code: " << result << std::endl;
@@ -337,14 +374,12 @@ extern "C" void mgpuStreamSynchronize(StreamWrapper *stream) {
     stream->sync();
 }
 
-extern "C" void mgpuStreamDestroy(StreamWrapper *stream) {
-  if (stream)
-    delete stream;
-}
+extern "C" void mgpuStreamDestroy(StreamWrapper *stream) { delete stream; }
 
 extern "C" void mgpuStreamWaitEvent(StreamWrapper *stream,
                                     ze_event_handle_t event) {
-  assert(stream && event);
+  assert(stream && "Invalid stream");
+  assert(event && "Invalid event");
   stream->sync(event);
 }
 
@@ -364,10 +399,10 @@ extern "C" void mgpuEventSynchronize(ze_event_handle_t event) {
 
 extern "C" void mgpuEventRecord(ze_event_handle_t event,
                                 StreamWrapper *stream) {
-  L0_SAFE_CALL(
-      zeCommandListAppendSignalEvent(getRtContext().immCmdListCopy, event));
-  L0_SAFE_CALL(
-      zeCommandListAppendSignalEvent(getRtContext().immCmdListCompute, event));
+  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
+      getRtContext().immCmdListCopy.get(), event));
+  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
+      getRtContext().immCmdListCompute.get(), event));
 }
 
 extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
@@ -380,12 +415,13 @@ extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
     if (isShared) {
       ze_host_mem_alloc_desc_t hostDesc = {};
       hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
-      L0_SAFE_CALL(zeMemAllocShared(getRtContext().context, &deviceDesc,
+      L0_SAFE_CALL(zeMemAllocShared(getRtContext().context.get(), &deviceDesc,
                                     &hostDesc, size, alignment,
                                     getRtContext().device, &memPtr));
     } else {
-      L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context, &deviceDesc, size,
-                                    alignment, getRtContext().device, &memPtr));
+      L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context.get(), &deviceDesc,
+                                    size, alignment, getRtContext().device,
+                                    &memPtr));
     }
     if (!memPtr)
       throw std::runtime_error("mem allocation failed!");
@@ -396,16 +432,16 @@ extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
 extern "C" void mgpuMemFree(void *ptr, StreamWrapper *stream) {
   stream->sync();
   if (ptr)
-    L0_SAFE_CALL(zeMemFree(getRtContext().context, ptr));
+    L0_SAFE_CALL(zeMemFree(getRtContext().context.get(), ptr));
 }
 
 extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
                            StreamWrapper *stream) {
   stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                         ze_event_handle_t *waitEvents) {
-    L0_SAFE_CALL(zeCommandListAppendMemoryCopy(getRtContext().immCmdListCopy,
-                                               dst, src, sizeBytes, newEvent,
-                                               numWaitEvents, waitEvents));
+    L0_SAFE_CALL(zeCommandListAppendMemoryCopy(
+        getRtContext().immCmdListCopy.get(), dst, src, sizeBytes, newEvent,
+        numWaitEvents, waitEvents));
   });
 }
 
@@ -414,8 +450,8 @@ void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
                 StreamWrapper *stream) {
   auto listType =
       getRtContext().copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
-          ? getRtContext().immCmdListCopy
-          : getRtContext().immCmdListCompute;
+          ? getRtContext().immCmdListCopy.get()
+          : getRtContext().immCmdListCompute.get();
   stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                         ze_event_handle_t *waitEvents) {
     L0_SAFE_CALL(zeCommandListAppendMemoryFill(
@@ -471,7 +507,7 @@ extern "C" void mgpuLaunchKernel(ze_kernel_handle_t kernel, size_t gridX,
   stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                         ze_event_handle_t *waitEvents) {
     L0_SAFE_CALL(zeCommandListAppendLaunchKernel(
-        getRtContext().immCmdListCompute, kernel, &dispatch, newEvent,
+        getRtContext().immCmdListCompute.get(), kernel, &dispatch, newEvent,
         numWaitEvents, waitEvents));
   });
 }
@@ -484,7 +520,6 @@ extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
   catchAll([&]() {
     // For now, a user must ensure that streams and events complete
     // and are destroyed before switching a device.
-    getRtContext().cleanup();
     getRtContext() = L0RtContext(devIdx);
     getDynamicEventPool() = DynamicEventPool(&getRtContext());
   });

>From 7df2e28511da865ed6190dedae5171941000b58c Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 16:39:53 +0000
Subject: [PATCH 05/12] Use std::unique_ptr for DynamicEventPool members.

---
 .../LevelZeroRuntimeWrappers.cpp              | 110 ++++++++++++------
 1 file changed, 75 insertions(+), 35 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
index 729bc76978956..073605bf482c6 100644
--- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -61,9 +61,8 @@ static ze_driver_handle_t getDriver(uint32_t idx = 0) {
   driver_type.pNext = nullptr;
   uint32_t driverCount{0};
   thread_local static std::vector<ze_driver_handle_t> drivers;
-
   thread_local static bool isDriverInitialised{false};
-  if (isDriverInitialised)
+  if (isDriverInitialised && idx < drivers.size())
     return drivers[idx];
   L0_SAFE_CALL(zeInitDrivers(&driverCount, nullptr, &driver_type));
   if (!driverCount)
@@ -83,7 +82,8 @@ static ze_device_handle_t getDefaultDevice(const uint32_t driverIdx = 0,
                                            const int32_t devIdx = 0) {
   thread_local static ze_device_handle_t l0Device;
   thread_local static int32_t currDevIdx{-1};
-  if (devIdx == currDevIdx)
+  thread_local static uint32_t currDriverIdx{0};
+  if (currDriverIdx == driverIdx && currDevIdx == devIdx)
     return l0Device;
   auto driver = getDriver(driverIdx);
   uint32_t deviceCount{0};
@@ -96,6 +96,7 @@ static ze_device_handle_t getDefaultDevice(const uint32_t driverIdx = 0,
   std::vector<ze_device_handle_t> devices(deviceCount);
   L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
   l0Device = devices[devIdx];
+  currDriverIdx = driverIdx;
   currDevIdx = devIdx;
   return l0Device;
 }
@@ -130,20 +131,18 @@ struct ZeCommandListDeleter {
       L0_SAFE_CALL(zeCommandListDestroy(cmdList));
   }
 };
-
+using UniqueZeContext =
+    std::unique_ptr<std::remove_pointer<ze_context_handle_t>::type,
+                    ZeContextDeleter>;
+using UniqueZeCommandList =
+    std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
+                    ZeCommandListDeleter>;
 struct L0RtContext {
   ze_driver_handle_t driver{nullptr};
   ze_device_handle_t device{nullptr};
-  using UniqueZeContext =
-      std::unique_ptr<std::remove_pointer<ze_context_handle_t>::type,
-                      ZeContextDeleter>;
   UniqueZeContext context;
-
   // Usually, one immediate command list with ordinal 0 suffices for
   // both copy and compute ops, but leaves HW underutilized.
-  using UniqueZeCommandList =
-      std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
-                      ZeCommandListDeleter>;
   UniqueZeCommandList immCmdListCompute;
   // Copy engines can be used for both memcpy and memset, but
   // they have limitations for memset pattern size (e.g., 1 byte).
@@ -219,13 +218,37 @@ struct L0RtContext {
   ~L0RtContext() = default;
 };
 
+struct ZeEventDeleter {
+  void operator()(ze_event_handle_t event) const {
+    if (event)
+      L0_SAFE_CALL(zeEventDestroy(event));
+  }
+};
+
+struct ZeEventPoolDeleter {
+  void operator()(ze_event_pool_handle_t pool) const {
+    if (pool)
+      L0_SAFE_CALL(zeEventPoolDestroy(pool));
+  }
+};
+
+using UniqueZeEvent =
+    std::unique_ptr<std::remove_pointer<ze_event_handle_t>::type,
+                    ZeEventDeleter>;
+using UniqueZeEventPool =
+    std::unique_ptr<std::remove_pointer<ze_event_pool_handle_t>::type,
+                    ZeEventPoolDeleter>;
+
 // L0 only supports pre-determined sizes of event pools,
-// implement a rt data struct to avoid running out of events.
+// implement a runtime data structure to avoid running out of events.
+
 struct DynamicEventPool {
   constexpr static size_t numEventsPerPool{128};
-  std::vector<ze_event_pool_handle_t> eventPools;
-  std::vector<ze_event_handle_t> availableEvents;
-  std::unordered_set<ze_event_handle_t> takenEvents;
+
+  std::vector<UniqueZeEventPool> eventPools;
+  std::vector<UniqueZeEvent> availableEvents;
+  std::unordered_map<ze_event_handle_t, UniqueZeEvent> takenEvents;
+
   size_t currentEventsLimit{0};
   size_t currentEventsCnt{0};
   L0RtContext *rtCtx;
@@ -234,51 +257,68 @@ struct DynamicEventPool {
     createNewPool(numEventsPerPool);
   }
 
+  DynamicEventPool(const DynamicEventPool &) = delete;
+  DynamicEventPool &operator=(const DynamicEventPool &) = delete;
+
+  // Allow move
+  DynamicEventPool(DynamicEventPool &&) noexcept = default;
+  DynamicEventPool &operator=(DynamicEventPool &&) noexcept = default;
+
   ~DynamicEventPool() {
-    assert(!takenEvents.size());
-    // zeEventDestroy will trigger L0_SAFE_CALL if an event is still used by
-    // device
-    for (auto event : availableEvents)
-      L0_SAFE_CALL(zeEventDestroy(event));
-    for (auto pool : eventPools)
-      L0_SAFE_CALL(zeEventPoolDestroy(pool));
+    assert(takenEvents.empty() && "Some events were not released");
   }
 
   void createNewPool(size_t numEvents) {
     ze_event_pool_desc_t eventPoolDesc = {};
     eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
     eventPoolDesc.count = numEvents;
-    eventPools.push_back(nullptr);
+
+    ze_event_pool_handle_t rawPool = nullptr;
     L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context.get(), &eventPoolDesc, 1,
-                                   &rtCtx->device, &eventPools.back()));
+                                   &rtCtx->device, &rawPool));
+
+    eventPools.emplace_back(UniqueZeEventPool(rawPool));
     currentEventsLimit += numEvents;
   }
 
   ze_event_handle_t takeEvent() {
-    ze_event_handle_t event{nullptr};
-    if (availableEvents.size()) {
-      event = availableEvents.back();
+    ze_event_handle_t rawEvent = nullptr;
+
+    if (!availableEvents.empty()) {
+      // Reuse one
+      auto uniqueEvent = std::move(availableEvents.back());
       availableEvents.pop_back();
+      rawEvent = uniqueEvent.get();
+      takenEvents[rawEvent] = std::move(uniqueEvent);
     } else {
       if (currentEventsCnt == currentEventsLimit)
         createNewPool(numEventsPerPool);
-      currentEventsCnt++;
+
       ze_event_desc_t eventDesc = {
           ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
           static_cast<uint32_t>(currentEventsCnt % numEventsPerPool),
           ZE_EVENT_SCOPE_FLAG_DEVICE, ZE_EVENT_SCOPE_FLAG_HOST};
-      L0_SAFE_CALL(zeEventCreate(eventPools.back(), &eventDesc, &event));
+
+      ze_event_handle_t newEvent = nullptr;
+      L0_SAFE_CALL(
+          zeEventCreate(eventPools.back().get(), &eventDesc, &newEvent));
+
+      takenEvents[newEvent] = UniqueZeEvent(newEvent);
+      rawEvent = newEvent;
+      currentEventsCnt++;
     }
-    takenEvents.insert(event);
-    return event;
+
+    return rawEvent;
   }
 
   void releaseEvent(ze_event_handle_t event) {
-    auto found = takenEvents.find(event);
-    assert(found != takenEvents.end());
-    takenEvents.erase(found);
+    auto it = takenEvents.find(event);
+    assert(it != takenEvents.end() &&
+           "Attempting to release unknown or already released event");
+
     L0_SAFE_CALL(zeEventHostReset(event));
-    availableEvents.push_back(event);
+    availableEvents.emplace_back(std::move(it->second));
+    takenEvents.erase(it);
   }
 };
 

>From 266135cb9b84d74dd528049faa3ba8d635cb9008 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 19:44:50 +0000
Subject: [PATCH 06/12] Address review comments.

Add Level Zero error messages to the L0_SAFE_CALL diagnostics.
Find LevelZero only once when either the SYCL or the LevelZero runner is enabled.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt       | 30 ++++++++-----------
 .../LevelZeroRuntimeWrappers.cpp              | 29 +++++++++---------
 2 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index f0d590ce1f85f..169f99e08de71 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -389,6 +389,15 @@ if(LLVM_ENABLE_PIC)
     )
   endif()
 
+  if(MLIR_ENABLE_SYCL_RUNNER OR MLIR_ENABLE_LEVELZERO_RUNNER)
+    # Both runtimes require LevelZero, so we can find it once.
+    find_package(LevelZeroRuntime)
+
+    if(NOT LevelZeroRuntime_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+  endif()
+
   if(MLIR_ENABLE_SYCL_RUNNER)
     find_package(SyclRuntime)
 
@@ -396,12 +405,6 @@ if(LLVM_ENABLE_PIC)
       message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
     endif()
 
-    find_package(LevelZeroRuntime)
-
-    if(NOT LevelZeroRuntime_FOUND)
-      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
-    endif()
-
     add_mlir_library(mlir_sycl_runtime
       SHARED
       SyclRuntimeWrappers.cpp
@@ -427,12 +430,6 @@ if(LLVM_ENABLE_PIC)
   endif()
 
   if(MLIR_ENABLE_LEVELZERO_RUNNER)
-    find_package(LevelZeroRuntime)
-
-    if(NOT LevelZeroRuntime_FOUND)
-      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
-    endif()
-
     add_mlir_library(mlir_levelzero_runtime
       SHARED
       LevelZeroRuntimeWrappers.cpp
@@ -440,12 +437,11 @@ if(LLVM_ENABLE_PIC)
       EXCLUDE_FROM_LIBMLIR
     )
 
-    check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
-
-    if(NOT CXX_HAS_FRTTI_FLAG)
-      message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
-    endif()
+    # check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
 
+    # if(NOT CXX_HAS_FRTTI_FLAG)
+    # message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+    # endif()
     target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
 
     target_include_directories(mlir_levelzero_runtime PRIVATE
diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
index 073605bf482c6..1cb6141308a4f 100644
--- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -23,7 +23,6 @@
 #include <vector>
 
 namespace {
-
 template <typename F>
 auto catchAll(F &&func) {
   try {
@@ -41,7 +40,9 @@ auto catchAll(F &&func) {
   {                                                                            \
     ze_result_t status = (call);                                               \
     if (status != ZE_RESULT_SUCCESS) {                                         \
-      std::cerr << "L0 error " << status << std::endl;                         \
+      const char *errorString;                                                 \
+      zeDriverGetLastErrorDescription(NULL, &errorString);                     \
+      std::cerr << "L0 error " << status << ": " << errorString << std::endl;  \
       std::abort();                                                            \
     }                                                                          \
   }
@@ -78,21 +79,20 @@ static ze_driver_handle_t getDriver(uint32_t idx = 0) {
   return drivers[idx];
 }
 
-static ze_device_handle_t getDefaultDevice(const uint32_t driverIdx = 0,
-                                           const int32_t devIdx = 0) {
+static ze_device_handle_t getDevice(const uint32_t driverIdx = 0,
+                                    const int32_t devIdx = 0) {
   thread_local static ze_device_handle_t l0Device;
-  thread_local static int32_t currDevIdx{-1};
-  thread_local static uint32_t currDriverIdx{0};
+  thread_local int32_t currDevIdx{-1};
+  thread_local uint32_t currDriverIdx{0};
   if (currDriverIdx == driverIdx && currDevIdx == devIdx)
     return l0Device;
   auto driver = getDriver(driverIdx);
   uint32_t deviceCount{0};
   L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr));
   if (!deviceCount)
-    throw std::runtime_error(
-        "getDefaultDevice failed: did not find L0 device.");
+    throw std::runtime_error("getDevice failed: did not find L0 device.");
   if (static_cast<int>(deviceCount) < devIdx + 1)
-    throw std::runtime_error("getDefaultDevice failed: devIdx out-of-bounds.");
+    throw std::runtime_error("getDevice failed: devIdx out-of-bounds.");
   std::vector<ze_device_handle_t> devices(deviceCount);
   L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
   l0Device = devices[devIdx];
@@ -150,8 +150,8 @@ struct L0RtContext {
   uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
 
   L0RtContext() = default;
-  L0RtContext(const int32_t devIdx = 0)
-      : driver(getDriver()), device(getDefaultDevice(devIdx)) {
+  L0RtContext(const uint32_t driverIdx = 0, const int32_t devIdx = 0)
+      : driver(getDriver(driverIdx)), device(getDevice(devIdx)) {
     // Create context
     ze_context_handle_t defaultCtx = getDefaultContext();
     context.reset(defaultCtx);
@@ -488,10 +488,11 @@ extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
 template <typename PATTERN_TYPE>
 void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
                 StreamWrapper *stream) {
+  L0RtContext &rtContext = getRtContext();
   auto listType =
-      getRtContext().copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
-          ? getRtContext().immCmdListCopy.get()
-          : getRtContext().immCmdListCompute.get();
+      rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
+          ? rtContext.immCmdListCopy.get()
+          : rtContext.immCmdListCompute.get();
   stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                         ze_event_handle_t *waitEvents) {
     L0_SAFE_CALL(zeCommandListAppendMemoryFill(

>From 65aea78e596271d7c226344fa21810a1ca095f6a Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 20:07:04 +0000
Subject: [PATCH 07/12] Address review comments.

Add line breaks for overly long lines.
Rename test directory to 'LevelZero'.
---
 .../gpu-addf32-to-spirv.mlir                  | 11 +++++-----
 .../gpu-addi64-to-spirv.mlir                  | 11 +++++-----
 .../gpu-memcpy-addf32-to-spirv.mlir           | 11 +++++-----
 .../gpu-reluf32-to-spirv.mlir                 | 22 ++++++++++---------
 .../{LEVELZERO => LevelZero}/lit.local.cfg    |  0
 5 files changed, 30 insertions(+), 25 deletions(-)
 rename mlir/test/Integration/GPU/{LEVELZERO => LevelZero}/gpu-addf32-to-spirv.mlir (84%)
 rename mlir/test/Integration/GPU/{LEVELZERO => LevelZero}/gpu-addi64-to-spirv.mlir (84%)
 rename mlir/test/Integration/GPU/{LEVELZERO => LevelZero}/gpu-memcpy-addf32-to-spirv.mlir (84%)
 rename mlir/test/Integration/GPU/{LEVELZERO => LevelZero}/gpu-reluf32-to-spirv.mlir (77%)
 rename mlir/test/Integration/GPU/{LEVELZERO => LevelZero}/lit.local.cfg (100%)

diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
similarity index 84%
rename from mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
rename to mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
index 82cefb4e3279b..7e66dee0272f6 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
@@ -26,9 +26,8 @@ module @add attributes {gpu.container_module} {
     memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
     %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
     %2 = gpu.wait async
-
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
-
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+          args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
     gpu.wait [%3]
     %alloc = memref.alloc() : memref<2x2x2xf32>
     memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
@@ -39,8 +38,10 @@ module @add attributes {gpu.container_module} {
     gpu.wait [%7]
     return %alloc : memref<2x2x2xf32>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+  gpu.module @test_kernel
+  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+    attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
       %2 = gpu.block_id  z
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
similarity index 84%
rename from mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
rename to mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
index 61400874b3716..df8fbe4d86d9c 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
@@ -26,9 +26,8 @@ module @add attributes {gpu.container_module} {
     memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
     %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
     %2 = gpu.wait async
-
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
-
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1)
+          args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
     gpu.wait [%3]
     %alloc = memref.alloc() : memref<3x3xi64>
     memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
@@ -39,8 +38,10 @@ module @add attributes {gpu.container_module} {
     gpu.wait [%7]
     return %alloc : memref<3x3xi64>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+  gpu.module @test_kernel
+  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel
+    attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
       %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
similarity index 84%
rename from mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
rename to mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
index 3531d14a98d16..cd99f2c70dc6e 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
@@ -26,9 +26,8 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc () : memref<2x2x2xf32>
     gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
     %memref_1 = gpu.alloc () : memref<2x2x2xf32>
-
-    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
-
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+      args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
     %alloc = memref.alloc() : memref<2x2x2xf32>
     gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
     gpu.dealloc %memref_1 : memref<2x2x2xf32>
@@ -36,8 +35,10 @@ module @add attributes {gpu.container_module} {
     gpu.dealloc %memref : memref<2x2x2xf32>
     return %alloc : memref<2x2x2xf32>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+  gpu.module @test_kernel
+  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+    attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
       %2 = gpu.block_id  z
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
similarity index 77%
rename from mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
rename to mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
index f935a04a8c7a3..8d022ac1cf277 100644
--- a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
@@ -41,15 +41,13 @@ module @relu attributes {gpu.container_module} {
     memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
     %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
     %2 = gpu.wait async
-
-    %3 = gpu.launch_func async [%2]  @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
-
+    %3 = gpu.launch_func async [%2]  @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+          args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
     gpu.wait [%3]
     %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
     %4 = gpu.wait async
-
-    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
-
+    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+          args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
     %memref_1 : memref<4x5xf32>)
     gpu.wait [%5]
     %alloc = memref.alloc() : memref<4x5xf32>
@@ -60,8 +58,10 @@ module @relu attributes {gpu.container_module} {
     %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
     return %alloc : memref<4x5xf32>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+  gpu.module @test_kernel
+  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel
+    attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
       %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
@@ -70,8 +70,10 @@ module @relu attributes {gpu.container_module} {
       gpu.return
     }
   }
-  gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
-    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+  gpu.module @test_kernel_0
+  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel
+    attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
       %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
diff --git a/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg
similarity index 100%
rename from mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
rename to mlir/test/Integration/GPU/LevelZero/lit.local.cfg

>From ac6f56bba15b0130b39a89538b4f80393976e413 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 20:18:03 +0000
Subject: [PATCH 08/12] Address review comments.

Revert formatting added to CMake file.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt | 77 ++++++++++---------------
 1 file changed, 30 insertions(+), 47 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 169f99e08de71..2212bfbb34f57 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -20,7 +20,7 @@ set(LLVM_OPTIONAL_SOURCES
   VulkanRuntimeWrappers.cpp
   VulkanRuntime.cpp
   VulkanRuntime.h
-)
+  )
 
 # Use a separate library for OptUtils, to avoid pulling in the entire JIT and
 # codegen infrastructure. Unlike MLIRExecutionEngine, this is part of
@@ -46,7 +46,7 @@ add_mlir_library(MLIRExecutionEngineUtils
   IPO
   Passes
   TargetParser
-)
+  )
 
 if(NOT MLIR_ENABLE_EXECUTION_ENGINE)
   return()
@@ -54,12 +54,12 @@ endif()
 
 if(LLVM_USE_INTEL_JITEVENTS)
   set(LLVM_JIT_LISTENER_LIB
-    IntelJITEvents)
+      IntelJITEvents)
 endif(LLVM_USE_INTEL_JITEVENTS)
 
 if(LLVM_USE_PERF)
   set(LLVM_JIT_LISTENER_LIB
-    PerfJITEvents)
+      PerfJITEvents)
 endif(LLVM_USE_PERF)
 
 add_mlir_library(MLIRExecutionEngine
@@ -92,7 +92,7 @@ add_mlir_library(MLIRExecutionEngine
   IPO
   Passes
   ${LLVM_JIT_LISTENER_LIB}
-)
+  )
 
 mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
   MLIRBuiltinToLLVMIRTranslation
@@ -101,9 +101,9 @@ mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
   MLIRLLVMToLLVMIRTranslation
   MLIROpenMPToLLVMIRTranslation
   MLIRTargetLLVMIRExport
-)
+  )
 
-if(LLVM_BUILD_LLVM_DYLIB AND NOT(WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
+if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
   # Build a shared library for the execution engine. Some downstream projects
   # use this library to build their own CPU runners while preserving dynamic
   # linkage.
@@ -123,7 +123,7 @@ if(LLVM_BUILD_LLVM_DYLIB AND NOT(WIN32 OR MINGW OR CYGWIN)) # Does not build on
     LINK_LIBS PUBLIC
     LLVM
     MLIR
-  )
+    )
 endif()
 
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
@@ -163,7 +163,7 @@ if(LLVM_ENABLE_PIC)
     Float16bits.cpp
 
     EXCLUDE_FROM_LIBMLIR
-  )
+    )
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
@@ -180,7 +180,7 @@ if(LLVM_ENABLE_PIC)
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
-  )
+    )
   set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
 
@@ -206,7 +206,6 @@ if(LLVM_ENABLE_PIC)
   )
   set_property(TARGET mlir_async_runtime PROPERTY CXX_VISIBILITY_PRESET hidden)
   target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
-
   if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
     # Don't export symbols from link-time dependencies, these are internal
     # implementation details.
@@ -228,8 +227,7 @@ if(LLVM_ENABLE_PIC)
     # custom error message.
     include(CheckLanguage)
     check_language(CUDA)
-
-    if(CMAKE_CUDA_COMPILER)
+    if (CMAKE_CUDA_COMPILER)
       enable_language(CUDA)
     else()
       message(SEND_ERROR
@@ -293,14 +291,13 @@ if(LLVM_ENABLE_PIC)
 
   if(MLIR_ENABLE_ROCM_RUNNER)
     # Configure ROCm support.
-    if(NOT DEFINED ROCM_PATH)
-      if(NOT DEFINED ENV{ROCM_PATH})
+    if (NOT DEFINED ROCM_PATH)
+      if (NOT DEFINED ENV{ROCM_PATH})
         set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
       else()
         set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
       endif()
     endif()
-
     # A lot of the ROCm CMake files expect to find their own dependencies in
     # CMAKE_PREFIX_PATH and don't respect PATHS or HINTS :( .
     # Therefore, temporarily add the ROCm path to CMAKE_PREFIX_PATH so we can
@@ -310,28 +307,24 @@ if(LLVM_ENABLE_PIC)
     find_package(hip REQUIRED)
     set(CMAKE_PREFIX_PATH "${REAL_CMAKE_PREFIX_PATH}")
 
-    if(NOT DEFINED ROCM_TEST_CHIPSET)
+    if (NOT DEFINED ROCM_TEST_CHIPSET)
       find_program(ROCM_AGENT_ENUMERATOR rocm_agent_enumerator "${ROCM_PATH}/bin" /usr/bin /usr/local/bin)
-
       if(ROCM_AGENT_ENUMERATOR)
-        execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
+          execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
           OUTPUT_VARIABLE AGENTS_STRING
           ERROR_VARIABLE AGENTS_STRING
           RESULT_VARIABLE AGENT_ENUMERATOR_RESULT)
       else()
         message(SEND_ERROR "Could not find rocm_agent_enumerator")
       endif()
-
-      if(NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
+      if (NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
         message(SEND_ERROR "Could not run rocm_agent_enumerator and ROCM_TEST_CHIPSET is not defined")
         set(AGENTS_STRING "")
       endif()
-
       string(STRIP AGENTS_STRING ${AGENTS_STRING})
       string(REPLACE "\n" ";" AGENTS_LIST ${AGENTS_STRING})
       list(FILTER AGENTS_LIST EXCLUDE REGEX "gfx000")
-
-      if(AGENTS_LIST STREQUAL "")
+      if (AGENTS_LIST STREQUAL "")
         message(SEND_ERROR "No non-CPU ROCm agents found on the system, and ROCM_TEST_CHIPSET is not defined")
       else()
         list(GET AGENTS_LIST 0 FIRST_AGENT)
@@ -350,34 +343,27 @@ if(LLVM_ENABLE_PIC)
     # Supress compiler warnings from HIP headers
     check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi
       CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG)
-
-    if(CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
+    if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-c++98-compat-extra-semi")
     endif()
-
     check_cxx_compiler_flag(-Wno-return-type-c-linkage
-      CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
-
-    if(CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+        CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+    if (CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-return-type-c-linkage")
     endif()
-
     check_cxx_compiler_flag(-Wno-nested-anon-types
       CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
-
-    if(CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
+    if (CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
         "-Wno-nested-anon-types")
     endif()
-
     check_cxx_compiler_flag(-Wno-gnu-anonymous-struct
       CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
-
-    if(CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+    if (CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
       target_compile_options(mlir_rocm_runtime PRIVATE
-        "-Wno-gnu-anonymous-struct")
+      "-Wno-gnu-anonymous-struct")
     endif()
 
     set_property(TARGET mlir_rocm_runtime
@@ -413,12 +399,10 @@ if(LLVM_ENABLE_PIC)
     )
 
     check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
-
     if(NOT CXX_HAS_FRTTI_FLAG)
       message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
     endif()
-
-    target_compile_options(mlir_sycl_runtime PUBLIC -fexceptions -frtti)
+    target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
 
     target_include_directories(mlir_sycl_runtime PRIVATE
       ${MLIR_INCLUDE_DIRS}
@@ -466,26 +450,25 @@ if(LLVM_ENABLE_PIC)
       mlir_spirv_cpu_runtime_EXPORTS)
   endif()
 
-  if(MLIR_ENABLE_VULKAN_RUNNER)
+  if (MLIR_ENABLE_VULKAN_RUNNER)
     find_package(Vulkan)
 
     # If Vulkan is not found try a path specified by VULKAN_SDK.
-    if(NOT Vulkan_FOUND)
-      if("$ENV{VULKAN_SDK}" STREQUAL "")
+    if (NOT Vulkan_FOUND)
+      if ("$ENV{VULKAN_SDK}" STREQUAL "")
         message(FATAL_ERROR "Vulkan not found through CMake; please provide "
-          "VULKAN_SDK path as an environment variable")
+                            "VULKAN_SDK path as an environment variable")
       endif()
 
       find_library(Vulkan_LIBRARY vulkan HINTS "$ENV{VULKAN_SDK}/lib" REQUIRED)
-
-      if(Vulkan_LIBRARY)
+      if (Vulkan_LIBRARY)
         set(Vulkan_FOUND ON)
         set(Vulkan_INCLUDE_DIR "$ENV{VULKAN_SDK}/include")
         message(STATUS "Found Vulkan: " ${Vulkan_LIBRARY})
       endif()
     endif()
 
-    if(NOT Vulkan_FOUND)
+    if (NOT Vulkan_FOUND)
       message(FATAL_ERROR "Cannot find Vulkan library")
     endif()
 

>From dce11e3cc94cf6c5a10b5b88e7a447c96c917eb4 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Wed, 30 Jul 2025 20:20:18 +0000
Subject: [PATCH 09/12] Fix end-of-line issue.

---
 mlir/lib/ExecutionEngine/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 2212bfbb34f57..18c2376f8b3d4 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -491,4 +491,4 @@ if(LLVM_ENABLE_PIC)
       ${Vulkan_LIBRARY}
     )
   endif()
-endif()
\ No newline at end of file
+endif()

>From 71a4609d51d433d0ab08ec31e77df7aa0115cc9a Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Mon, 4 Aug 2025 21:32:43 +0000
Subject: [PATCH 10/12] Address review comments.

Limit the maximum no. of events.
Modify getContext to be a bit more generic.
---
 .../LevelZeroRuntimeWrappers.cpp              | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
index 1cb6141308a4f..21eaf28c9f214 100644
--- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -102,13 +102,12 @@ static ze_device_handle_t getDevice(const uint32_t driverIdx = 0,
 }
 
 // Returns the default L0 context of the default driver.
-static ze_context_handle_t getDefaultContext() {
+static ze_context_handle_t getContext(ze_driver_handle_t driver) {
   thread_local static ze_context_handle_t context;
   thread_local static bool isContextInitialised{false};
   if (isContextInitialised)
     return context;
   ze_context_desc_t ctxtDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
-  auto driver = getDriver();
   L0_SAFE_CALL(zeContextCreate(driver, &ctxtDesc, &context));
   isContextInitialised = true;
   return context;
@@ -137,7 +136,7 @@ using UniqueZeContext =
 using UniqueZeCommandList =
     std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
                     ZeCommandListDeleter>;
-struct L0RtContext {
+struct L0RTContextWrapper {
   ze_driver_handle_t driver{nullptr};
   ze_device_handle_t device{nullptr};
   UniqueZeContext context;
@@ -149,12 +148,12 @@ struct L0RtContext {
   UniqueZeCommandList immCmdListCopy;
   uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
 
-  L0RtContext() = default;
-  L0RtContext(const uint32_t driverIdx = 0, const int32_t devIdx = 0)
+  L0RTContextWrapper() = default;
+  L0RTContextWrapper(const uint32_t driverIdx = 0, const int32_t devIdx = 0)
       : driver(getDriver(driverIdx)), device(getDevice(devIdx)) {
     // Create context
-    ze_context_handle_t defaultCtx = getDefaultContext();
-    context.reset(defaultCtx);
+    ze_context_handle_t ctx = getContext(driver);
+    context.reset(ctx);
 
     // Determine ordinals
     uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
@@ -210,12 +209,12 @@ struct L0RtContext {
         context.get(), device, &cmdQueueDesc, &rawCmdListCompute));
     immCmdListCompute.reset(rawCmdListCompute);
   }
-  L0RtContext(const L0RtContext &) = delete;
-  L0RtContext &operator=(const L0RtContext &) = delete;
+  L0RTContextWrapper(const L0RTContextWrapper &) = delete;
+  L0RTContextWrapper &operator=(const L0RTContextWrapper &) = delete;
   // Allow move
-  L0RtContext(L0RtContext &&) noexcept = default;
-  L0RtContext &operator=(L0RtContext &&) noexcept = default;
-  ~L0RtContext() = default;
+  L0RTContextWrapper(L0RTContextWrapper &&) noexcept = default;
+  L0RTContextWrapper &operator=(L0RTContextWrapper &&) noexcept = default;
+  ~L0RTContextWrapper() = default;
 };
 
 struct ZeEventDeleter {
@@ -249,11 +248,15 @@ struct DynamicEventPool {
   std::vector<UniqueZeEvent> availableEvents;
   std::unordered_map<ze_event_handle_t, UniqueZeEvent> takenEvents;
 
+  // Limit the number of events to avoid running out of memory.
+  // The limit is set to 32K events, which should be sufficient for most use
+  // cases.
+  size_t maxEventsCount{32768}; // 32K events
   size_t currentEventsLimit{0};
   size_t currentEventsCnt{0};
-  L0RtContext *rtCtx;
+  L0RTContextWrapper *rtCtx;
 
-  DynamicEventPool(L0RtContext *rtCtx) : rtCtx(rtCtx) {
+  DynamicEventPool(L0RTContextWrapper *rtCtx) : rtCtx(rtCtx) {
     createNewPool(numEventsPerPool);
   }
 
@@ -291,6 +294,9 @@ struct DynamicEventPool {
       rawEvent = uniqueEvent.get();
       takenEvents[rawEvent] = std::move(uniqueEvent);
     } else {
+      if (currentEventsCnt >= maxEventsCount) {
+        throw std::runtime_error("DynamicEventPool: reached max events limit");
+      }
       if (currentEventsCnt == currentEventsLimit)
         createNewPool(numEventsPerPool);
 
@@ -322,8 +328,8 @@ struct DynamicEventPool {
   }
 };
 
-L0RtContext &getRtContext() {
-  thread_local static L0RtContext rtContext(0);
+L0RTContextWrapper &getRtContext() {
+  thread_local static L0RTContextWrapper rtContext(0);
   return rtContext;
 }
 
@@ -488,7 +494,7 @@ extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
 template <typename PATTERN_TYPE>
 void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
                 StreamWrapper *stream) {
-  L0RtContext &rtContext = getRtContext();
+  L0RTContextWrapper &rtContext = getRtContext();
   auto listType =
       rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
           ? rtContext.immCmdListCopy.get()
@@ -561,7 +567,7 @@ extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
   catchAll([&]() {
     // For now, a user must ensure that streams and events complete
     // and are destroyed before switching a device.
-    getRtContext() = L0RtContext(devIdx);
+    getRtContext() = L0RTContextWrapper(devIdx);
     getDynamicEventPool() = DynamicEventPool(&getRtContext());
   });
 }

>From aad2d6e751973a120fc47abde25f4e6ca795f7c9 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Tue, 5 Aug 2025 17:25:25 +0000
Subject: [PATCH 11/12] Address review comments.

Remove unused code from CMake file.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 18c2376f8b3d4..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -421,11 +421,6 @@ if(LLVM_ENABLE_PIC)
       EXCLUDE_FROM_LIBMLIR
     )
 
-    # check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
-
-    # if(NOT CXX_HAS_FRTTI_FLAG)
-    # message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
-    # endif()
     target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
 
     target_include_directories(mlir_levelzero_runtime PRIVATE

>From 77dbdd63a22d97b3b05673215afc80cfddf6010f Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Tue, 5 Aug 2025 17:32:49 +0000
Subject: [PATCH 12/12] Remove newline changes in CMake file.

---
 mlir/cmake/modules/FindLevelZeroRuntime.cmake | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/mlir/cmake/modules/FindLevelZeroRuntime.cmake b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
index b1e8e5b6387f2..2a8fb3a16d16f 100644
--- a/mlir/cmake/modules/FindLevelZeroRuntime.cmake
+++ b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
@@ -74,22 +74,17 @@ function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
     # If they differ in size pad the shorter list with 0s
     if(VL1_LEN GREATER VL2_LEN)
         math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
-
         foreach(IDX RANGE 1 ${DIFF} 1)
             list(APPEND VL2 "0")
         endforeach()
     elseif(VL2_LEN GREATER VL2_LEN)
         math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
-
         foreach(IDX RANGE 1 ${DIFF} 1)
             list(APPEND VL2 "0")
         endforeach()
-
         set(LEN ${VL2_LEN})
     endif()
-
     math(EXPR LEN_SUB_ONE "${LEN}-1")
-
     foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
         list(GET VL1 ${IDX} VAL1)
         list(GET VL2 ${IDX} VAL2)



More information about the Mlir-commits mailing list