[Mlir-commits] [mlir] [mlir][ExecutionEngine] Add LevelZeroRuntimeWrapper. (PR #151038)
Md Abdullah Shahneous Bari
llvmlistbot at llvm.org
Mon Jul 28 13:48:38 PDT 2025
https://github.com/mshahneo created https://github.com/llvm/llvm-project/pull/151038
Adds LevelZeroRuntime wrapper and tests.
>From 675499a138a77a964348123fb78b4c6068ee9ae5 Mon Sep 17 00:00:00 2001
From: "Shahneous Bari, Md Abdullah" <md.abdullah.shahneous.bari at intel.com>
Date: Mon, 28 Jul 2025 20:29:55 +0000
Subject: [PATCH] [mlir][ExecutionEngine] Add LevelZeroRuntimeWrapper.
Adds LevelZeroRuntime wrapper and tests.
Co-authored-by: Artem Kroviakov <artem.kroviakov at intel.com>
Co-authored-by: Nishant Patel <nishant.b.patel at intel.com>
---
...lZero.cmake => FindLevelZeroRuntime.cmake} | 113 ++--
mlir/lib/ExecutionEngine/CMakeLists.txt | 117 +++--
.../LevelZeroRuntimeWrappers.cpp | 491 ++++++++++++++++++
.../GPU/LEVELZERO/gpu-addf32-to-spirv.mlir | 56 ++
.../GPU/LEVELZERO/gpu-addi64-to-spirv.mlir | 54 ++
.../LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir | 53 ++
.../GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir | 79 +++
.../Integration/GPU/LEVELZERO/lit.local.cfg | 2 +
8 files changed, 878 insertions(+), 87 deletions(-)
rename mlir/cmake/modules/{FindLevelZero.cmake => FindLevelZeroRuntime.cmake} (66%)
create mode 100644 mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
create mode 100644 mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
create mode 100644 mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
similarity index 66%
rename from mlir/cmake/modules/FindLevelZero.cmake
rename to mlir/cmake/modules/FindLevelZeroRuntime.cmake
index 012187f0afc0b..b1e8e5b6387f2 100644
--- a/mlir/cmake/modules/FindLevelZero.cmake
+++ b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
@@ -20,7 +20,6 @@ include(FindPackageHandleStandardArgs)
# Search path priority
# 1. CMake Variable LEVEL_ZERO_DIR
# 2. Environment Variable LEVEL_ZERO_DIR
-
if(NOT LEVEL_ZERO_DIR)
if(DEFINED ENV{LEVEL_ZERO_DIR})
set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
@@ -28,32 +27,32 @@ if(NOT LEVEL_ZERO_DIR)
endif()
if(LEVEL_ZERO_DIR)
- find_path(LevelZero_INCLUDE_DIR
+ find_path(LevelZeroRuntime_INCLUDE_DIR
NAMES level_zero/ze_api.h
PATHS ${LEVEL_ZERO_DIR}/include
NO_DEFAULT_PATH
)
if(LINUX)
- find_library(LevelZero_LIBRARY
+ find_library(LevelZeroRuntime_LIBRARY
NAMES ze_loader
PATHS ${LEVEL_ZERO_DIR}/lib
- ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+ ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
NO_DEFAULT_PATH
)
else()
- find_library(LevelZero_LIBRARY
+ find_library(LevelZeroRuntime_LIBRARY
NAMES ze_loader
PATHS ${LEVEL_ZERO_DIR}/lib
NO_DEFAULT_PATH
)
endif()
else()
- find_path(LevelZero_INCLUDE_DIR
+ find_path(LevelZeroRuntime_INCLUDE_DIR
NAMES level_zero/ze_api.h
)
- find_library(LevelZero_LIBRARY
+ find_library(LevelZeroRuntime_LIBRARY
NAMES ze_loader
)
endif()
@@ -64,26 +63,33 @@ endif()
# lists of equal lengths, with the shorter string getting zero-padded.
function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
# Convert the strings to list
- string(REPLACE "." ";" VL1 ${VERSION_STR1})
- string(REPLACE "." ";" VL2 ${VERSION_STR2})
+ string(REPLACE "." ";" VL1 ${VERSION_STR1})
+ string(REPLACE "." ";" VL2 ${VERSION_STR2})
+
# get lengths of both lists
list(LENGTH VL1 VL1_LEN)
list(LENGTH VL2 VL2_LEN)
set(LEN ${VL1_LEN})
+
# If they differ in size pad the shorter list with 0s
if(VL1_LEN GREATER VL2_LEN)
math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+
foreach(IDX RANGE 1 ${DIFF} 1)
list(APPEND VL2 "0")
endforeach()
elseif(VL2_LEN GREATER VL2_LEN)
math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+
foreach(IDX RANGE 1 ${DIFF} 1)
list(APPEND VL2 "0")
endforeach()
+
set(LEN ${VL2_LEN})
endif()
+
math(EXPR LEN_SUB_ONE "${LEN}-1")
+
foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
list(GET VL1 ${IDX} VAL1)
list(GET VL2 ${IDX} VAL2)
@@ -98,12 +104,10 @@ function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
set(${OUTPUT} TRUE PARENT_SCOPE)
endif()
endforeach()
-
- endfunction(compare_versions)
+endfunction(compare_versions)
# Creates a small function to run and extract the LevelZero loader version.
function(get_l0_loader_version)
-
set(L0_VERSIONEER_SRC
[====[
#include <iostream>
@@ -142,19 +146,20 @@ function(get_l0_loader_version)
# We need both the directories in the include path as ze_loader.h
# includes "ze_api.h" and not "level_zero/ze_api.h".
- list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
- list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+ list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+ list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR}/level_zero)
list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
- "${CMAKE_BINARY_DIR}"
- "${L0_VERSIONEER_FILE}"
- LINK_LIBRARIES ${LevelZero_LIBRARY}
- CMAKE_FLAGS
- "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
- RUN_OUTPUT_VARIABLE L0_VERSION
+ "${CMAKE_BINARY_DIR}"
+ "${L0_VERSIONEER_FILE}"
+ LINK_LIBRARIES ${LevelZeroRuntime_LIBRARY}
+ CMAKE_FLAGS
+ "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+ RUN_OUTPUT_VARIABLE L0_VERSION
)
- if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
- set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+
+ if(${L0_VERSIONEER_COMPILE} AND(DEFINED L0_VERSIONEER_RUN))
+ set(LevelZeroRuntime_VERSION ${L0_VERSION} PARENT_SCOPE)
message(STATUS "Found Level Zero of version: ${L0_VERSION}")
else()
message(FATAL_ERROR
@@ -163,59 +168,61 @@ function(get_l0_loader_version)
endif()
endfunction(get_l0_loader_version)
-if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
- list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
- list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+if(LevelZeroRuntime_INCLUDE_DIR AND LevelZeroRuntime_LIBRARY)
+ list(APPEND LevelZeroRuntime_LIBRARIES "${LevelZeroRuntime_LIBRARY}")
+ list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+
if(OpenCL_FOUND)
- list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+ list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
endif()
- cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
- set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
-
- if(NOT TARGET LevelZero::LevelZero)
- add_library(LevelZero::LevelZero INTERFACE IMPORTED)
- set_target_properties(LevelZero::LevelZero
- PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
- )
- set_target_properties(LevelZero::LevelZero
- PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
- )
+ cmake_path(GET LevelZeroRuntime_LIBRARY PARENT_PATH LevelZeroRuntime_LIBRARIES_PATH)
+ set(LevelZeroRuntime_LIBRARIES_DIR ${LevelZeroRuntime_LIBRARIES_PATH})
+
+ if(NOT TARGET LevelZeroRuntime::LevelZeroRuntime)
+ add_library(LevelZeroRuntime::LevelZeroRuntime INTERFACE IMPORTED)
+ set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+ PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZeroRuntime_LIBRARIES}"
+ )
+ set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+ PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZeroRuntime_INCLUDE_DIRS}"
+ )
endif()
endif()
# Check if a specific version of Level Zero is required
-if(LevelZero_FIND_VERSION)
+if(LevelZeroRuntime_FIND_VERSION)
get_l0_loader_version()
set(VERSION_GT_FIND_VERSION FALSE)
compare_versions(
- ${LevelZero_VERSION}
- ${LevelZero_FIND_VERSION}
+ ${LevelZeroRuntime_VERSION}
+ ${LevelZeroRuntime_FIND_VERSION}
VERSION_GT_FIND_VERSION
)
+
if(${VERSION_GT_FIND_VERSION})
- set(LevelZero_FOUND TRUE)
+ set(LevelZeroRuntime_FOUND TRUE)
else()
- set(LevelZero_FOUND FALSE)
+ set(LevelZeroRuntime_FOUND FALSE)
endif()
else()
- set(LevelZero_FOUND TRUE)
+ set(LevelZeroRuntime_FOUND TRUE)
endif()
-find_package_handle_standard_args(LevelZero
+find_package_handle_standard_args(LevelZeroRuntime
REQUIRED_VARS
- LevelZero_FOUND
- LevelZero_INCLUDE_DIRS
- LevelZero_LIBRARY
- LevelZero_LIBRARIES_DIR
+ LevelZeroRuntime_FOUND
+ LevelZeroRuntime_INCLUDE_DIRS
+ LevelZeroRuntime_LIBRARY
+ LevelZeroRuntime_LIBRARIES_DIR
HANDLE_COMPONENTS
)
-mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+mark_as_advanced(LevelZeroRuntime_LIBRARY LevelZeroRuntime_INCLUDE_DIRS)
-if(LevelZero_FOUND)
- find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
- "(found version ${LevelZero_VERSION})"
+if(LevelZeroRuntime_FOUND)
+ find_package_message(LevelZeroRuntime "Found LevelZero: ${LevelZeroRuntime_LIBRARY}"
+ "(found version ${LevelZeroRuntime_VERSION})"
)
else()
- find_package_message(LevelZero "Could not find LevelZero" "")
+ find_package_message(LevelZeroRuntime "Could not find LevelZero" "")
endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index dd2ac75b88798..06c879f082926 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -14,12 +14,13 @@ set(LLVM_OPTIONAL_SOURCES
RunnerUtils.cpp
OptUtils.cpp
JitRunner.cpp
+ LevelZeroRuntimeWrappers.cpp
SpirvCpuRuntimeWrappers.cpp
SyclRuntimeWrappers.cpp
VulkanRuntimeWrappers.cpp
VulkanRuntime.cpp
VulkanRuntime.h
- )
+)
# Use a separate library for OptUtils, to avoid pulling in the entire JIT and
# codegen infrastructure. Unlike MLIRExecutionEngine, this is part of
@@ -45,7 +46,7 @@ add_mlir_library(MLIRExecutionEngineUtils
IPO
Passes
TargetParser
- )
+)
if(NOT MLIR_ENABLE_EXECUTION_ENGINE)
return()
@@ -53,12 +54,12 @@ endif()
if(LLVM_USE_INTEL_JITEVENTS)
set(LLVM_JIT_LISTENER_LIB
- IntelJITEvents)
+ IntelJITEvents)
endif(LLVM_USE_INTEL_JITEVENTS)
if(LLVM_USE_PERF)
set(LLVM_JIT_LISTENER_LIB
- PerfJITEvents)
+ PerfJITEvents)
endif(LLVM_USE_PERF)
add_mlir_library(MLIRExecutionEngine
@@ -91,7 +92,7 @@ add_mlir_library(MLIRExecutionEngine
IPO
Passes
${LLVM_JIT_LISTENER_LIB}
- )
+)
mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
MLIRBuiltinToLLVMIRTranslation
@@ -100,9 +101,9 @@ mlir_target_link_libraries(MLIRExecutionEngine PUBLIC
MLIRLLVMToLLVMIRTranslation
MLIROpenMPToLLVMIRTranslation
MLIRTargetLLVMIRExport
- )
+)
-if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
+if(LLVM_BUILD_LLVM_DYLIB AND NOT(WIN32 OR MINGW OR CYGWIN)) # Does not build on windows currently, see #106859
# Build a shared library for the execution engine. Some downstream projects
# use this library to build their own CPU runners while preserving dynamic
# linkage.
@@ -122,7 +123,7 @@ if(LLVM_BUILD_LLVM_DYLIB AND NOT (WIN32 OR MINGW OR CYGWIN)) # Does not build on
LINK_LIBS PUBLIC
LLVM
MLIR
- )
+ )
endif()
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
@@ -162,7 +163,7 @@ if(LLVM_ENABLE_PIC)
Float16bits.cpp
EXCLUDE_FROM_LIBMLIR
- )
+ )
set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
@@ -179,7 +180,7 @@ if(LLVM_ENABLE_PIC)
mlir_float16_utils
MLIRSparseTensorEnums
MLIRSparseTensorRuntime
- )
+ )
set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 17)
target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
@@ -205,6 +206,7 @@ if(LLVM_ENABLE_PIC)
)
set_property(TARGET mlir_async_runtime PROPERTY CXX_VISIBILITY_PRESET hidden)
target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
+
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Don't export symbols from link-time dependencies, these are internal
# implementation details.
@@ -226,7 +228,8 @@ if(LLVM_ENABLE_PIC)
# custom error message.
include(CheckLanguage)
check_language(CUDA)
- if (CMAKE_CUDA_COMPILER)
+
+ if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
else()
message(SEND_ERROR
@@ -290,13 +293,14 @@ if(LLVM_ENABLE_PIC)
if(MLIR_ENABLE_ROCM_RUNNER)
# Configure ROCm support.
- if (NOT DEFINED ROCM_PATH)
- if (NOT DEFINED ENV{ROCM_PATH})
+ if(NOT DEFINED ROCM_PATH)
+ if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
endif()
endif()
+
# A lot of the ROCm CMake files expect to find their own dependencies in
# CMAKE_PREFIX_PATH and don't respect PATHS or HINTS :( .
# Therefore, temporarily add the ROCm path to CMAKE_PREFIX_PATH so we can
@@ -306,24 +310,28 @@ if(LLVM_ENABLE_PIC)
find_package(hip REQUIRED)
set(CMAKE_PREFIX_PATH "${REAL_CMAKE_PREFIX_PATH}")
- if (NOT DEFINED ROCM_TEST_CHIPSET)
+ if(NOT DEFINED ROCM_TEST_CHIPSET)
find_program(ROCM_AGENT_ENUMERATOR rocm_agent_enumerator "${ROCM_PATH}/bin" /usr/bin /usr/local/bin)
+
if(ROCM_AGENT_ENUMERATOR)
- execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
+ execute_process(COMMAND "${ROCM_AGENT_ENUMERATOR}"
OUTPUT_VARIABLE AGENTS_STRING
ERROR_VARIABLE AGENTS_STRING
RESULT_VARIABLE AGENT_ENUMERATOR_RESULT)
else()
message(SEND_ERROR "Could not find rocm_agent_enumerator")
endif()
- if (NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
+
+ if(NOT AGENT_ENUMERATOR_RESULT EQUAL 0)
message(SEND_ERROR "Could not run rocm_agent_enumerator and ROCM_TEST_CHIPSET is not defined")
set(AGENTS_STRING "")
endif()
+
string(STRIP AGENTS_STRING ${AGENTS_STRING})
string(REPLACE "\n" ";" AGENTS_LIST ${AGENTS_STRING})
list(FILTER AGENTS_LIST EXCLUDE REGEX "gfx000")
- if (AGENTS_LIST STREQUAL "")
+
+ if(AGENTS_LIST STREQUAL "")
message(SEND_ERROR "No non-CPU ROCm agents found on the system, and ROCM_TEST_CHIPSET is not defined")
else()
list(GET AGENTS_LIST 0 FIRST_AGENT)
@@ -342,27 +350,34 @@ if(LLVM_ENABLE_PIC)
# Supress compiler warnings from HIP headers
check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi
CXX_SUPPORTS_NO_CXX98_COMPAT_EXTRA_SEMI_FLAG)
- if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
+
+ if(CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG)
target_compile_options(mlir_rocm_runtime PRIVATE
"-Wno-c++98-compat-extra-semi")
endif()
+
check_cxx_compiler_flag(-Wno-return-type-c-linkage
- CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
- if (CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+ CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
+
+ if(CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG)
target_compile_options(mlir_rocm_runtime PRIVATE
"-Wno-return-type-c-linkage")
endif()
+
check_cxx_compiler_flag(-Wno-nested-anon-types
CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
- if (CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
+
+ if(CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG)
target_compile_options(mlir_rocm_runtime PRIVATE
"-Wno-nested-anon-types")
endif()
+
check_cxx_compiler_flag(-Wno-gnu-anonymous-struct
CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
- if (CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+
+ if(CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
target_compile_options(mlir_rocm_runtime PRIVATE
- "-Wno-gnu-anonymous-struct")
+ "-Wno-gnu-anonymous-struct")
endif()
set_property(TARGET mlir_rocm_runtime
@@ -381,9 +396,9 @@ if(LLVM_ENABLE_PIC)
message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
endif()
- find_package(LevelZero)
+ find_package(LevelZeroRuntime)
- if(NOT LevelZero_FOUND)
+ if(NOT LevelZeroRuntime_FOUND)
message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
endif()
@@ -395,18 +410,51 @@ if(LLVM_ENABLE_PIC)
)
check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+
if(NOT CXX_HAS_FRTTI_FLAG)
message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
endif()
- target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
+
+ target_compile_options(mlir_sycl_runtime PUBLIC -fexceptions -frtti)
target_include_directories(mlir_sycl_runtime PRIVATE
${MLIR_INCLUDE_DIRS}
)
- target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+ target_link_libraries(mlir_sycl_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime SyclRuntime::SyclRuntime)
+
+ set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+ endif()
+
+ if(MLIR_ENABLE_LEVEL_ZERO_RUNNER)
+ find_package(LevelZeroRuntime)
+
+ if(NOT LevelZeroRuntime_FOUND)
+ message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+ endif()
+
+ add_mlir_library(mlir_levelzero_runtime
+ SHARED
+ LevelZeroRuntimeWrappers.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+ )
+
+ check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+
+ if(NOT CXX_HAS_FRTTI_FLAG)
+ message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+ endif()
+
+ target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
+
+ target_include_directories(mlir_levelzero_runtime PRIVATE
+ ${MLIR_INCLUDE_DIRS}
+ )
+
+ target_link_libraries(mlir_levelzero_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime)
- set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+ set_property(TARGET mlir_levelzero_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}")
endif()
if(MLIR_ENABLE_SPIRV_CPU_RUNNER)
@@ -422,25 +470,26 @@ if(LLVM_ENABLE_PIC)
mlir_spirv_cpu_runtime_EXPORTS)
endif()
- if (MLIR_ENABLE_VULKAN_RUNNER)
+ if(MLIR_ENABLE_VULKAN_RUNNER)
find_package(Vulkan)
# If Vulkan is not found try a path specified by VULKAN_SDK.
- if (NOT Vulkan_FOUND)
- if ("$ENV{VULKAN_SDK}" STREQUAL "")
+ if(NOT Vulkan_FOUND)
+ if("$ENV{VULKAN_SDK}" STREQUAL "")
message(FATAL_ERROR "Vulkan not found through CMake; please provide "
- "VULKAN_SDK path as an environment variable")
+ "VULKAN_SDK path as an environment variable")
endif()
find_library(Vulkan_LIBRARY vulkan HINTS "$ENV{VULKAN_SDK}/lib" REQUIRED)
- if (Vulkan_LIBRARY)
+
+ if(Vulkan_LIBRARY)
set(Vulkan_FOUND ON)
set(Vulkan_INCLUDE_DIR "$ENV{VULKAN_SDK}/include")
message(STATUS "Found Vulkan: " ${Vulkan_LIBRARY})
endif()
endif()
- if (NOT Vulkan_FOUND)
+ if(NOT Vulkan_FOUND)
message(FATAL_ERROR "Cannot find Vulkan library")
endif()
diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
new file mode 100644
index 0000000000000..70ac4761dc7fd
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -0,0 +1,491 @@
+//===- LevelZeroRuntimeWrappers.cpp - MLIR Level Zero (L0) wrapper library-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements wrappers around the Level Zero (L0) runtime library with C linkage
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+
+#include <cassert>
+#include <deque>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <level_zero/ze_api.h>
+#include <limits>
+#include <unordered_set>
+#include <vector>
+
+namespace {
+
+template <typename F>
+auto catchAll(F &&func) {
+ try {
+ return func();
+ } catch (const std::exception &e) {
+ std::cerr << "An exception was thrown: " << e.what() << std::endl;
+ std::abort();
+ } catch (...) {
+ std::cerr << "An unknown exception was thrown." << std::endl;
+ std::abort();
+ }
+}
+
+// Evaluates a Level Zero API call and, if it does not return
+// ZE_RESULT_SUCCESS, prints the returned ze_result_t to stderr and aborts the
+// process. The body is wrapped in do/while(0) so that the expansion is a
+// single statement: it requires the trailing semicolon at the call site and
+// can be used as the unbraced body of an if that has an else branch.
+#define L0_SAFE_CALL(call) \
+  do { \
+    ze_result_t status = (call); \
+    if (status != ZE_RESULT_SUCCESS) { \
+      std::cerr << "L0 error " << status << std::endl; \
+      std::abort(); \
+    } \
+  } while (0)
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// L0 RT context & device setters
+//===----------------------------------------------------------------------===//
+
+// Returns the L0 driver handle for the given index. Default index is 0
+// (i.e., returns the first driver handle of the available drivers).
+//
+// GPU drivers are enumerated with zeInitDrivers on the first call made by a
+// thread and the handles are cached in thread-local storage. Throws
+// std::runtime_error if no driver is found or if `idx` is not a valid index
+// into the enumerated drivers. The index is validated on every call, including
+// calls served from the cache.
+static ze_driver_handle_t getDriver(uint32_t idx = 0) {
+  thread_local static std::vector<ze_driver_handle_t> drivers;
+  thread_local static bool isDriverInitialised{false};
+
+  if (!isDriverInitialised) {
+    ze_init_driver_type_desc_t driver_type = {};
+    driver_type.stype = ZE_STRUCTURE_TYPE_INIT_DRIVER_TYPE_DESC;
+    driver_type.flags = ZE_INIT_DRIVER_TYPE_FLAG_GPU;
+    driver_type.pNext = nullptr;
+    uint32_t driverCount{0};
+    // The first call only queries the driver count, the second one fills in
+    // the handles.
+    L0_SAFE_CALL(zeInitDrivers(&driverCount, nullptr, &driver_type));
+    if (!driverCount)
+      throw std::runtime_error("No L0 drivers found.");
+    drivers.resize(driverCount);
+    L0_SAFE_CALL(zeInitDrivers(&driverCount, drivers.data(), &driver_type));
+    isDriverInitialised = true;
+  }
+
+  // Checked outside the initialisation branch so that a later call with a bad
+  // index cannot read past the end of the cached vector.
+  if (idx >= drivers.size())
+    throw std::runtime_error((llvm::Twine("Requested driver idx out-of-bound, "
+                                          "number of available drivers: ") +
+                              std::to_string(drivers.size()))
+                                 .str());
+  return drivers[idx];
+}
+
+// Returns the L0 device handle with index `devIdx` of the driver with index
+// `driverIdx`. The most recently returned handle is cached per thread together
+// with the (driverIdx, devIdx) pair it was looked up for. Throws
+// std::runtime_error if the driver reports no device, or if `devIdx` is
+// negative or not smaller than the number of reported devices.
+static ze_device_handle_t getDefaultDevice(const uint32_t driverIdx = 0,
+                                           const int32_t devIdx = 0) {
+  thread_local static ze_device_handle_t l0Device;
+  thread_local static uint32_t currDriverIdx{0};
+  thread_local static int32_t currDevIdx{-1};
+  // -1 is the "nothing cached yet" sentinel of currDevIdx. A negative request
+  // must neither match it nor be used to index `devices` below.
+  if (devIdx < 0)
+    throw std::runtime_error("getDefaultDevice failed: devIdx out-of-bounds.");
+  // The cache is only valid for the driver it was filled from.
+  if (devIdx == currDevIdx && driverIdx == currDriverIdx)
+    return l0Device;
+  auto driver = getDriver(driverIdx);
+  uint32_t deviceCount{0};
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr));
+  if (!deviceCount)
+    throw std::runtime_error(
+        "getDefaultDevice failed: did not find L0 device.");
+  if (static_cast<uint32_t>(devIdx) >= deviceCount)
+    throw std::runtime_error("getDefaultDevice failed: devIdx out-of-bounds.");
+  std::vector<ze_device_handle_t> devices(deviceCount);
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
+  l0Device = devices[devIdx];
+  currDriverIdx = driverIdx;
+  currDevIdx = devIdx;
+  return l0Device;
+}
+
+// Returns the default L0 context of the default driver.
+static ze_context_handle_t getDefaultContext() {
+ thread_local static ze_context_handle_t context;
+ thread_local static bool isContextInitialised{false};
+ if (isContextInitialised)
+ return context;
+ ze_context_desc_t ctxtDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+ auto driver = getDriver();
+ L0_SAFE_CALL(zeContextCreate(driver, &ctxtDesc, &context));
+ isContextInitialised = true;
+ return context;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 RT helper structs
+//===----------------------------------------------------------------------===//
+
+// Bundles the L0 handles the wrappers operate on: driver, device, context and
+// two immediate command lists (compute and copy). The command lists and the
+// context are destroyed by cleanup(), which the destructor calls. Because the
+// object owns these handles it is move-only: a copy would leave two objects
+// destroying the same handles.
+struct L0RtContext {
+  ze_driver_handle_t driver{nullptr};
+  ze_device_handle_t device{nullptr};
+  ze_context_handle_t context{nullptr};
+  // Usually, one immediate command list with ordinal 0 suffices for
+  // both copy and compute ops, but leaves HW underutilized.
+  ze_command_list_handle_t immCmdListCompute{nullptr};
+  // Copy engines can be used for both memcpy and memset, but
+  // they have limitations for memset pattern size (e.g., 1 byte).
+  ze_command_list_handle_t immCmdListCopy{nullptr};
+  uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
+
+  // Sets up the context for device `devIdx` of the default driver (index 0).
+  // The driver index is passed explicitly: getDefaultDevice takes
+  // (driverIdx, devIdx), so a single argument would be bound to driverIdx.
+  L0RtContext(const int32_t devIdx = 0)
+      : driver(getDriver()), device(getDefaultDevice(/*driverIdx=*/0, devIdx)),
+        context(getDefaultContext()) {
+    uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
+    ze_device_properties_t deviceProperties = {};
+    L0_SAFE_CALL(zeDeviceGetProperties(device, &deviceProperties));
+    uint32_t queueGroupCount = 0;
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, nullptr));
+    std::vector<ze_command_queue_group_properties_t> queueGroupProperties(
+        queueGroupCount);
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, queueGroupProperties.data()));
+    // Pick a compute-capable group and a copy group without the compute flag.
+    // When several groups qualify, the last one seen before both ordinals are
+    // set wins.
+    for (uint32_t queueGroupIdx = 0; queueGroupIdx < queueGroupCount;
+         ++queueGroupIdx) {
+      const auto &group = queueGroupProperties[queueGroupIdx];
+      if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+        computeEngineOrdinal = queueGroupIdx;
+      else if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) {
+        copyEngineOrdinal = queueGroupIdx;
+        copyEngineMaxMemoryFillPatternSize = group.maxMemoryFillPatternSize;
+      }
+      if (copyEngineOrdinal != -1u && computeEngineOrdinal != -1u)
+        break;
+    }
+    // Fallback to the default queue if no dedicated copy queue is available.
+    if (copyEngineOrdinal == -1u)
+      copyEngineOrdinal = computeEngineOrdinal;
+    assert(copyEngineOrdinal != -1u && computeEngineOrdinal != -1u &&
+           "Expected two engines to be available.");
+    ze_command_queue_desc_t cmdQueueDesc{
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        copyEngineOrdinal, // ordinal
+        0,                 // index (assume one physical engine in the group)
+        0,                 // flags
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
+                                              &immCmdListCopy));
+    cmdQueueDesc.ordinal = computeEngineOrdinal;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context, device, &cmdQueueDesc,
+                                              &immCmdListCompute));
+  }
+
+  // Copying is disabled because the destructor destroys the handles.
+  L0RtContext(const L0RtContext &) = delete;
+  L0RtContext &operator=(const L0RtContext &) = delete;
+
+  // Moving transfers ownership and leaves the source without handles, so the
+  // destructor of a moved-from temporary destroys nothing.
+  L0RtContext(L0RtContext &&other) noexcept { takeFrom(other); }
+  L0RtContext &operator=(L0RtContext &&other) noexcept {
+    if (this != &other) {
+      cleanup();
+      takeFrom(other);
+    }
+    return *this;
+  }
+
+  // Destroys the command lists and the context that are still held and resets
+  // them to nullptr, which makes repeated calls harmless.
+  // NOTE(review): `context` comes from getDefaultContext(), which keeps its
+  // own thread-local copy of the handle; after cleanup() that cached handle
+  // refers to a destroyed context. Confirm whether the cache needs resetting.
+  void cleanup() {
+    if (immCmdListCopy) {
+      L0_SAFE_CALL(zeCommandListDestroy(immCmdListCopy));
+      immCmdListCopy = nullptr;
+    }
+    if (immCmdListCompute) {
+      L0_SAFE_CALL(zeCommandListDestroy(immCmdListCompute));
+      immCmdListCompute = nullptr;
+    }
+    if (context) {
+      L0_SAFE_CALL(zeContextDestroy(context));
+      context = nullptr;
+    }
+  }
+  ~L0RtContext() { cleanup(); }
+
+  // Copies every member from `other` and clears the handles in `other` that
+  // cleanup() would destroy. Expects this object to hold no such handles.
+  void takeFrom(L0RtContext &other) noexcept {
+    driver = other.driver;
+    device = other.device;
+    context = other.context;
+    immCmdListCompute = other.immCmdListCompute;
+    immCmdListCopy = other.immCmdListCopy;
+    copyEngineMaxMemoryFillPatternSize =
+        other.copyEngineMaxMemoryFillPatternSize;
+    other.context = nullptr;
+    other.immCmdListCompute = nullptr;
+    other.immCmdListCopy = nullptr;
+  }
+};
+
+// L0 only supports pre-determined sizes of event pools,
+// implement a rt data struct to avoid running out of events.
+struct DynamicEventPool {
+ constexpr static size_t numEventsPerPool{128};
+ std::vector<ze_event_pool_handle_t> eventPools;
+ std::vector<ze_event_handle_t> availableEvents;
+ std::unordered_set<ze_event_handle_t> takenEvents;
+ size_t currentEventsLimit{0};
+ size_t currentEventsCnt{0};
+ L0RtContext *rtCtx;
+
+ DynamicEventPool(L0RtContext *rtCtx) : rtCtx(rtCtx) {
+ createNewPool(numEventsPerPool);
+ }
+
+ ~DynamicEventPool() {
+ assert(!takenEvents.size());
+ // zeEventDestroy will trigger L0_SAFE_CALL if an event is still used by
+ // device
+ for (auto event : availableEvents)
+ L0_SAFE_CALL(zeEventDestroy(event));
+ for (auto pool : eventPools)
+ L0_SAFE_CALL(zeEventPoolDestroy(pool));
+ }
+
+ void createNewPool(size_t numEvents) {
+ ze_event_pool_desc_t eventPoolDesc = {};
+ eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+ eventPoolDesc.count = numEvents;
+ eventPools.push_back(nullptr);
+ L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context, &eventPoolDesc, 1,
+ &rtCtx->device, &eventPools.back()));
+ currentEventsLimit += numEvents;
+ }
+
+ ze_event_handle_t takeEvent() {
+ ze_event_handle_t event{nullptr};
+ if (availableEvents.size()) {
+ event = availableEvents.back();
+ availableEvents.pop_back();
+ } else {
+ if (currentEventsCnt == currentEventsLimit)
+ createNewPool(numEventsPerPool);
+ currentEventsCnt++;
+ ze_event_desc_t eventDesc = {
+ ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
+ static_cast<uint32_t>(currentEventsCnt % numEventsPerPool),
+ ZE_EVENT_SCOPE_FLAG_DEVICE, ZE_EVENT_SCOPE_FLAG_HOST};
+ L0_SAFE_CALL(zeEventCreate(eventPools.back(), &eventDesc, &event));
+ }
+ takenEvents.insert(event);
+ return event;
+ }
+
+ void releaseEvent(ze_event_handle_t event) {
+ auto found = takenEvents.find(event);
+ assert(found != takenEvents.end());
+ takenEvents.erase(found);
+ L0_SAFE_CALL(zeEventHostReset(event));
+ availableEvents.push_back(event);
+ }
+};
+
+L0RtContext &getRtContext() {
+ thread_local static L0RtContext rtContext;
+ return rtContext;
+}
+
+DynamicEventPool &getDynamicEventPool() {
+ thread_local static DynamicEventPool dynEventPool{&getRtContext()};
+ return dynEventPool;
+}
+
+struct StreamWrapper {
+ // avoid event pointer invalidations
+ std::deque<ze_event_handle_t> implicitEventStack;
+ DynamicEventPool &dynEventPool;
+
+ StreamWrapper(DynamicEventPool &dynEventPool) : dynEventPool(dynEventPool) {}
+ ~StreamWrapper() { sync(); }
+
+ ze_event_handle_t *getLastImplicitEventPtr() {
+ // Assume current implicit events will not be used after `sync`.
+ return implicitEventStack.size() ? &implicitEventStack.back() : nullptr;
+ }
+
+ void sync(ze_event_handle_t explicitEvent = nullptr) {
+ ze_event_handle_t syncEvent{nullptr};
+ if (!explicitEvent) {
+ ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+ syncEvent = lastImplicitEventPtr ? *lastImplicitEventPtr : nullptr;
+ } else {
+ syncEvent = explicitEvent;
+ }
+ if (syncEvent)
+ L0_SAFE_CALL(zeEventHostSynchronize(
+ syncEvent, std::numeric_limits<uint64_t>::max()));
+ // All of the "implicit" events were signaled and are of no use, release
+ // them. "explicit" event must be "released" via mgpuEventDestroy
+ for (auto event : implicitEventStack)
+ dynEventPool.releaseEvent(event);
+ implicitEventStack.clear();
+ }
+
+ void enqueueOp(
+ std::function<void(ze_event_handle_t, uint32_t, ze_event_handle_t *)>
+ op) {
+ ze_event_handle_t newImplicitEvent = dynEventPool.takeEvent();
+ ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+ const uint32_t numWaitEvents = lastImplicitEventPtr ? 1 : 0;
+ op(newImplicitEvent, numWaitEvents, lastImplicitEventPtr);
+ implicitEventStack.push_back(newImplicitEvent);
+ }
+};
+
+// Creates an L0 module from the SPIR-V binary at `data` (`dataSize` bytes)
+// using the context and device of getRtContext(). If module creation fails,
+// prints the error code and the build log to stderr and aborts. On success the
+// build log is destroyed and the module handle is returned.
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
+  assert(data);
+  ze_module_handle_t zeModule;
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  ze_module_build_log_handle_t buildLogHandle{nullptr};
+  ze_result_t result =
+      zeModuleCreate(getRtContext().context, getRtContext().device, &desc,
+                     &zeModule, &buildLogHandle);
+  if (result != ZE_RESULT_SUCCESS) {
+    std::cerr << "Error creating module, error code: " << result << std::endl;
+    size_t logSize = 0;
+    L0_SAFE_CALL(zeModuleBuildLogGetString(buildLogHandle, &logSize, nullptr));
+    // (count, char) constructor: std::string(" ", logSize) would instead copy
+    // logSize bytes starting at a two-byte literal and read out of bounds.
+    std::string buildLog(logSize, ' ');
+    L0_SAFE_CALL(
+        zeModuleBuildLogGetString(buildLogHandle, &logSize, buildLog.data()));
+    std::cerr << "Build log:\n" << buildLog << std::endl;
+    std::abort();
+  }
+  // The log is only read on the failure path above; release it here so that
+  // successful loads do not leak one log object per module.
+  if (buildLogHandle) {
+    L0_SAFE_CALL(zeModuleBuildLogDestroy(buildLogHandle));
+  }
+  return zeModule;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 Wrappers definition
+//===----------------------------------------------------------------------===//
+
+extern "C" StreamWrapper *mgpuStreamCreate() {
+ return new StreamWrapper(getDynamicEventPool());
+}
+
+extern "C" void mgpuStreamSynchronize(StreamWrapper *stream) {
+ if (stream)
+ stream->sync();
+}
+
+extern "C" void mgpuStreamDestroy(StreamWrapper *stream) {
+ if (stream)
+ delete stream;
+}
+
+extern "C" void mgpuStreamWaitEvent(StreamWrapper *stream,
+ ze_event_handle_t event) {
+ assert(stream && event);
+ stream->sync(event);
+}
+
+extern "C" ze_event_handle_t mgpuEventCreate() {
+ return getDynamicEventPool().takeEvent();
+}
+
+extern "C" void mgpuEventDestroy(ze_event_handle_t event) {
+ return getDynamicEventPool().releaseEvent(event);
+}
+
+extern "C" void mgpuEventSynchronize(ze_event_handle_t event) {
+ L0_SAFE_CALL(
+ zeEventHostSynchronize(event, std::numeric_limits<uint64_t>::max()));
+ L0_SAFE_CALL(zeEventHostReset(event));
+}
+
+extern "C" void mgpuEventRecord(ze_event_handle_t event,
+ StreamWrapper *stream) {
+ L0_SAFE_CALL(
+ zeCommandListAppendSignalEvent(getRtContext().immCmdListCopy, event));
+ L0_SAFE_CALL(
+ zeCommandListAppendSignalEvent(getRtContext().immCmdListCompute, event));
+}
+
+extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
+ bool isShared) {
+ return catchAll([&]() {
+ void *memPtr = nullptr;
+ constexpr size_t alignment{64};
+ ze_device_mem_alloc_desc_t deviceDesc = {};
+ deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+ if (isShared) {
+ ze_host_mem_alloc_desc_t hostDesc = {};
+ hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+ L0_SAFE_CALL(zeMemAllocShared(getRtContext().context, &deviceDesc,
+ &hostDesc, size, alignment,
+ getRtContext().device, &memPtr));
+ } else {
+ L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context, &deviceDesc, size,
+ alignment, getRtContext().device, &memPtr));
+ }
+ if (!memPtr)
+ throw std::runtime_error("mem allocation failed!");
+ return memPtr;
+ });
+}
+
+// Frees `ptr` with zeMemFree in the context of getRtContext(). Before freeing,
+// waits for the operations enqueued on `stream` via StreamWrapper::sync(). A
+// null `stream` skips the wait instead of being dereferenced, in line with
+// mgpuStreamSynchronize and mgpuStreamDestroy; a null `ptr` is not freed.
+extern "C" void mgpuMemFree(void *ptr, StreamWrapper *stream) {
+  if (stream)
+    stream->sync();
+  if (ptr)
+    L0_SAFE_CALL(zeMemFree(getRtContext().context, ptr));
+}
+
+extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
+ StreamWrapper *stream) {
+ stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+ ze_event_handle_t *waitEvents) {
+ L0_SAFE_CALL(zeCommandListAppendMemoryCopy(getRtContext().immCmdListCopy,
+ dst, src, sizeBytes, newEvent,
+ numWaitEvents, waitEvents));
+ });
+}
+
+template <typename PATTERN_TYPE>
+void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
+ StreamWrapper *stream) {
+ auto listType =
+ getRtContext().copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
+ ? getRtContext().immCmdListCopy
+ : getRtContext().immCmdListCompute;
+ stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+ ze_event_handle_t *waitEvents) {
+ L0_SAFE_CALL(zeCommandListAppendMemoryFill(
+ listType, dst, &value, sizeof(PATTERN_TYPE),
+ count * sizeof(PATTERN_TYPE), newEvent, numWaitEvents, waitEvents));
+ });
+}
+extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count,
+ StreamWrapper *stream) {
+ mgpuMemset<unsigned int>(dst, value, count, stream);
+}
+
+extern "C" void mgpuMemset16(void *dst, unsigned short value, size_t count,
+ StreamWrapper *stream) {
+ mgpuMemset<unsigned short>(dst, value, count, stream);
+}
+
+extern "C" ze_module_handle_t mgpuModuleLoad(const void *data,
+ size_t gpuBlobSize) {
+ return catchAll([&]() { return loadModule(data, gpuBlobSize); });
+}
+
+extern "C" ze_kernel_handle_t mgpuModuleGetFunction(ze_module_handle_t module,
+ const char *name) {
+ assert(module && name);
+ ze_kernel_handle_t zeKernel;
+ ze_kernel_desc_t desc = {};
+ desc.pKernelName = name;
+ L0_SAFE_CALL(zeKernelCreate(module, &desc, &zeKernel));
+ return zeKernel;
+}
+
+extern "C" void mgpuLaunchKernel(ze_kernel_handle_t kernel, size_t gridX,
+ size_t gridY, size_t gridZ, size_t blockX,
+ size_t blockY, size_t blockZ,
+ size_t sharedMemBytes, StreamWrapper *stream,
+ void **params, void ** /*extra*/,
+ size_t paramsCount) {
+
+ if (sharedMemBytes > 0) {
+ paramsCount = paramsCount - 1; // Last param is shared memory size
+ L0_SAFE_CALL(
+ zeKernelSetArgumentValue(kernel, paramsCount, sharedMemBytes, nullptr));
+ }
+ for (size_t i = 0; i < paramsCount; ++i)
+ L0_SAFE_CALL(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
+ sizeof(void *), params[i]));
+ L0_SAFE_CALL(zeKernelSetGroupSize(kernel, blockX, blockY, blockZ));
+ ze_group_count_t dispatch;
+ dispatch.groupCountX = static_cast<uint32_t>(gridX);
+ dispatch.groupCountY = static_cast<uint32_t>(gridY);
+ dispatch.groupCountZ = static_cast<uint32_t>(gridZ);
+ stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+ ze_event_handle_t *waitEvents) {
+ L0_SAFE_CALL(zeCommandListAppendLaunchKernel(
+ getRtContext().immCmdListCompute, kernel, &dispatch, newEvent,
+ numWaitEvents, waitEvents));
+ });
+}
+
+extern "C" void mgpuModuleUnload(ze_module_handle_t module) {
+ L0_SAFE_CALL(zeModuleDestroy(module));
+}
+
+extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
+ catchAll([&]() {
+ // For now, a user must ensure that streams and events complete
+ // and are destroyed before switching a device.
+ getRtContext().cleanup();
+ getRtContext() = L0RtContext(devIdx);
+ getDynamicEventPool() = DynamicEventPool(&getRtContext());
+ });
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..e4b566b74c862
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addf32-to-spirv.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Integration test: element-wise f32 addition of two 2x2x2 memrefs on the GPU,
+// using host_shared allocations and explicit async tokens.
+module @add attributes {gpu.container_module} {
+ // The two constant operands of the addition.
+ memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+ memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+ // Entry point: loads both constants, runs @test on them and prints the
+ // returned memref.
+ func.func @main() {
+ %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+ %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+ %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+ %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ return
+ }
+ func.func private @printMemrefF32(memref<*xf32>)
+ // Copies both inputs into host_shared GPU buffers, launches the kernel on a
+ // 2x2x2 grid of single-thread blocks, copies the result into a host
+ // allocation and frees the GPU buffers.
+ func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %mem = gpu.alloc host_shared () : memref<2x2x2xf32>
+ memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
+ %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
+ memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
+ %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
+ // Launch asynchronously, then block until the kernel token completes
+ // before the result buffer is read.
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<2x2x2xf32>
+ memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
+ // Free the three GPU buffers as a chain of async deallocs and wait on the
+ // last token.
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
+ %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
+ gpu.wait [%7]
+ return %alloc : memref<2x2x2xf32>
+ }
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ // Each block handles one element, indexed by its (x, y, z) block id:
+ // arg2[x, y, z] = arg0[x, y, z] + arg1[x, y, z].
+ gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = gpu.block_id z
+ %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+ %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+ %5 = arith.addf %3, %4 : f32
+ memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+ gpu.return
+ }
+ }
+ // Expected output: the element-wise sums of the two constants, one innermost
+ // row per line.
+ // CHECK: [2.3, 4.5]
+ // CHECK: [7.8, 10.2]
+ // CHECK: [12.7, 14.9]
+ // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
new file mode 100644
index 0000000000000..c5aecd569ea75
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-addi64-to-spirv.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Integration test: element-wise i64 addition of two 3x3 memrefs on the GPU,
+// using host_shared allocations and explicit async tokens.
+module @add attributes {gpu.container_module} {
+ // The two constant operands; several values do not fit in 32 bits.
+ memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+ memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+ // Entry point: loads both constants, runs @test on them and prints the
+ // returned memref.
+ func.func @main() {
+ %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+ %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+ %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+ %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+ call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+ return
+ }
+ func.func private @printMemrefI64(memref<*xi64>)
+ // Copies both inputs into host_shared GPU buffers, launches the kernel on a
+ // 3x3x1 grid of single-thread blocks, copies the result into a host
+ // allocation and frees the GPU buffers.
+ func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+ %c3 = arith.constant 3 : index
+ %c1 = arith.constant 1 : index
+ %mem = gpu.alloc host_shared () : memref<3x3xi64>
+ memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
+ %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
+ memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+ %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
+ // Launch asynchronously, then block until the kernel token completes
+ // before the result buffer is read.
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<3x3xi64>
+ memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+ // Free the three GPU buffers as a chain of async deallocs and wait on the
+ // last token.
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+ %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
+ gpu.wait [%7]
+ return %alloc : memref<3x3xi64>
+ }
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ // Each block handles one element, indexed by its (x, y) block id:
+ // arg2[x, y] = arg0[x, y] + arg1[x, y].
+ gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+ %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+ %4 = arith.addi %2, %3 : i64
+ memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+ gpu.return
+ }
+ }
+ // Expected output: the element-wise sums of the two constants, one row per
+ // line.
+ // CHECK: [2, 4100, 6],
+ // CHECK: [16777224, 10, 4294971404],
+ // CHECK: [16777230, 1103806595088, 1099511627794]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..94a27906ed1f3
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-memcpy-addf32-to-spirv.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Integration test: element-wise f32 addition of two 2x2x2 memrefs on the GPU,
+// moving data with gpu.memcpy between host memrefs and gpu.alloc buffers.
+// No async tokens are written by hand here; the pass pipeline in this file's
+// run line starts with gpu-async-region.
+module @add attributes {gpu.container_module} {
+ // The two constant operands of the addition.
+ memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+ memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+ // Entry point: loads both constants, runs @test on them, prints the returned
+ // memref and frees it.
+ func.func @main() {
+ %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+ %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+ %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+ %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ memref.dealloc %2 : memref<2x2x2xf32>
+ return
+ }
+ func.func private @printMemrefF32(memref<*xf32>)
+ // Allocates three GPU buffers, copies both inputs in with gpu.memcpy,
+ // launches the kernel on a 2x2x2 grid of single-thread blocks, copies the
+ // result back into a host allocation and frees the GPU buffers.
+ func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %memref = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_1 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
+ %alloc = memref.alloc() : memref<2x2x2xf32>
+ gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ gpu.dealloc %memref_1 : memref<2x2x2xf32>
+ gpu.dealloc %memref_0 : memref<2x2x2xf32>
+ gpu.dealloc %memref : memref<2x2x2xf32>
+ return %alloc : memref<2x2x2xf32>
+ }
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ // Each block handles one element, indexed by its (x, y, z) block id:
+ // arg2[x, y, z] = arg0[x, y, z] + arg1[x, y, z].
+ gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = gpu.block_id z
+ %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+ %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+ %5 = arith.addf %3, %4 : f32
+ memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+ gpu.return
+ }
+ }
+ // Expected output: the element-wise sums of the two constants, one innermost
+ // row per line.
+ // CHECK: [2.3, 4.5]
+ // CHECK: [7.8, 10.2]
+ // CHECK: [12.7, 14.9]
+ // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
new file mode 100644
index 0000000000000..e385daefcb9b5
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/gpu-reluf32-to-spirv.mlir
@@ -0,0 +1,79 @@
+// Lowers the GPU modules to SPIR-V and the host code to LLVM, then executes
+// the result with the Level Zero runtime wrappers (same runtime library as
+// the other tests in this directory) and verifies the printed output.
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Integration test: ReLU over a 4x5 f32 memref, computed by two GPU kernels
+// (a "less than zero" mask followed by a select), repeated 100 times.
+module @relu attributes {gpu.container_module} {
+ // Input with a mix of negative and positive values.
+ memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
+ [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
+ [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
+ [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
+ [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
+ ]>
+
+ // Entry point: runs @test on the constant input 100 times and prints the
+ // result of every iteration.
+ func.func @main() {
+ %c1 = arith.constant 1 : index
+ %c100 = arith.constant 100 : index
+ %c0 = arith.constant 0 : index
+ %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>
+
+ scf.for %arg0 = %c0 to %c100 step %c1 {
+ %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
+ %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
+ func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ // Expected output: every negative input element replaced by 0.
+ // CHECK: [0, 0, 0, 0.4, 0.5],
+ // CHECK: [0.1, 0, 0.3, 0, 0.5],
+ // CHECK: [0.1, 0.2, 0.3, 0, 0],
+ // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
+ }
+ return
+ }
+
+ func.func private @printMemrefF32(memref<*xf32>)
+ // Copies the input into a host_shared GPU buffer, runs the mask kernel and
+ // then the select kernel (each on a 4x5x1 grid of single-thread blocks),
+ // copies the result into a host allocation and frees the GPU buffers.
+ func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
+ %c5 = arith.constant 5 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c1 = arith.constant 1 : index
+ %memref = gpu.alloc host_shared () : memref<4x5xf32>
+ memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
+ %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
+ // Kernel 1: mask[x, y] = input[x, y] < 0.0.
+ %2 = gpu.wait async
+ %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+ gpu.wait [%3]
+ %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
+ // Kernel 2: out[x, y] = mask[x, y] ? 0.0 : input[x, y].
+ %4 = gpu.wait async
+ %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
+ gpu.wait [%5]
+ %alloc = memref.alloc() : memref<4x5xf32>
+ memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
+ // Free the three GPU buffers as a chain of async deallocs and wait on the
+ // last token, so no async work is left pending when the function returns
+ // (it is called 100 times from @main).
+ %6 = gpu.wait async
+ %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
+ %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
+ %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+ gpu.wait [%9]
+ return %alloc : memref<4x5xf32>
+ }
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ // Each block handles one element, indexed by its (x, y) block id:
+ // arg2[x, y] = arg0[x, y] < arg1 (ordered less-than).
+ gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
+ %3 = arith.cmpf olt, %2, %arg1 : f32
+ memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+ gpu.return
+ }
+ }
+ gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+ // Each block handles one element, indexed by its (x, y) block id:
+ // arg3[x, y] = arg0[x, y] ? arg2 : arg1[x, y].
+ gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+ %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+ %4 = arith.select %2, %arg2, %3 : f32
+ memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
+ gpu.return
+ }
+ }
+}
diff --git a/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg b/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
new file mode 100644
index 0000000000000..36c7ad5f57c7e
--- /dev/null
+++ b/mlir/test/Integration/GPU/LEVELZERO/lit.local.cfg
@@ -0,0 +1,2 @@
+# Mark every test in this directory as unsupported unless the Level Zero
+# runner is enabled.
+# NOTE(review): `config.enable_levelzero_runner` is presumably provided by the
+# generated lit site configuration; confirm it is defined there.
+if not config.enable_levelzero_runner:
+ config.unsupported = True
More information about the Mlir-commits
mailing list