[flang-commits] [flang] Enabling Intel GPU Integration. (PR #65539)
Sang Ik Lee via flang-commits
flang-commits at lists.llvm.org
Wed Sep 6 15:02:14 PDT 2023
https://github.com/silee2 created https://github.com/llvm/llvm-project/pull/65539:
None
>From 863a72b4e099f4aa24e43fdaaeb2ab0e171a0381 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 30 Aug 2023 13:44:02 -0700
Subject: [PATCH 01/13] Add SyclRuntimeWrappers and Add CMake option
MLIR_ENABLE_SYCL_RUNNER
---
mlir/CMakeLists.txt | 1 +
mlir/cmake/modules/FindLevelZero.cmake | 221 ++++++++++
mlir/cmake/modules/FindSyclRuntime.cmake | 68 +++
mlir/lib/ExecutionEngine/CMakeLists.txt | 35 ++
.../ExecutionEngine/SyclRuntimeWrappers.cpp | 386 ++++++++++++++++++
5 files changed, 711 insertions(+)
create mode 100644 mlir/cmake/modules/FindLevelZero.cmake
create mode 100644 mlir/cmake/modules/FindSyclRuntime.cmake
create mode 100644 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index fa4f6e76f985fb5..4a67e018273819f 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -116,6 +116,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS})
set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner")
set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner")
+set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner")
set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner")
set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner")
set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake
new file mode 100644
index 000000000000000..012187f0afc0b07
--- /dev/null
+++ b/mlir/cmake/modules/FindLevelZero.cmake
@@ -0,0 +1,221 @@
+# CMake find_package() module for level-zero
+#
+# Example usage:
+#
+# find_package(LevelZero)
+#
+# If successful, the following variables will be defined:
+# LevelZero_FOUND
+# LevelZero_INCLUDE_DIRS
+# LevelZero_LIBRARY
+# LevelZero_LIBRARIES_DIR
+#
+# By default, the module searches the standard paths to locate the "ze_api.h"
+# and the ze_loader shared library. When using a custom level-zero installation,
+# the environment variable "LEVEL_ZERO_DIR" should be specified telling the
+# module to get the level-zero library and headers from that location.
+
+include(FindPackageHandleStandardArgs)
+
+# Search path priority
+# 1. CMake Variable LEVEL_ZERO_DIR
+# 2. Environment Variable LEVEL_ZERO_DIR
+
+if(NOT LEVEL_ZERO_DIR)
+ if(DEFINED ENV{LEVEL_ZERO_DIR})
+ set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
+ endif()
+endif()
+
+if(LEVEL_ZERO_DIR)
+ find_path(LevelZero_INCLUDE_DIR
+ NAMES level_zero/ze_api.h
+ PATHS ${LEVEL_ZERO_DIR}/include
+ NO_DEFAULT_PATH
+ )
+
+ if(LINUX)
+ find_library(LevelZero_LIBRARY
+ NAMES ze_loader
+ PATHS ${LEVEL_ZERO_DIR}/lib
+ ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+ NO_DEFAULT_PATH
+ )
+ else()
+ find_library(LevelZero_LIBRARY
+ NAMES ze_loader
+ PATHS ${LEVEL_ZERO_DIR}/lib
+ NO_DEFAULT_PATH
+ )
+ endif()
+else()
+ find_path(LevelZero_INCLUDE_DIR
+ NAMES level_zero/ze_api.h
+ )
+
+ find_library(LevelZero_LIBRARY
+ NAMES ze_loader
+ )
+endif()
+
+# Compares the two version string that are supposed to be in x.y.z format
+# and reports if the argument VERSION_STR1 is greater than or equal than
+# version_str2. The strings are compared lexicographically after conversion to
+# lists of equal lengths, with the shorter string getting zero-padded.
+function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
+ # Convert the strings to list
+ string(REPLACE "." ";" VL1 ${VERSION_STR1})
+ string(REPLACE "." ";" VL2 ${VERSION_STR2})
+ # get lengths of both lists
+ list(LENGTH VL1 VL1_LEN)
+ list(LENGTH VL2 VL2_LEN)
+ set(LEN ${VL1_LEN})
+ # If they differ in size pad the shorter list with 0s
+ if(VL1_LEN GREATER VL2_LEN)
+ math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+ foreach(IDX RANGE 1 ${DIFF} 1)
+ list(APPEND VL2 "0")
+ endforeach()
+ elseif(VL2_LEN GREATER VL2_LEN)
+ math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+ foreach(IDX RANGE 1 ${DIFF} 1)
+ list(APPEND VL2 "0")
+ endforeach()
+ set(LEN ${VL2_LEN})
+ endif()
+ math(EXPR LEN_SUB_ONE "${LEN}-1")
+ foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
+ list(GET VL1 ${IDX} VAL1)
+ list(GET VL2 ${IDX} VAL2)
+
+ if(${VAL1} GREATER ${VAL2})
+ set(${OUTPUT} TRUE PARENT_SCOPE)
+ break()
+ elseif(${VAL1} LESS ${VAL2})
+ set(${OUTPUT} FALSE PARENT_SCOPE)
+ break()
+ else()
+ set(${OUTPUT} TRUE PARENT_SCOPE)
+ endif()
+ endforeach()
+
+ endfunction(compare_versions)
+
+# Creates a small function to run and extract the LevelZero loader version.
+function(get_l0_loader_version)
+
+ set(L0_VERSIONEER_SRC
+ [====[
+ #include <iostream>
+ #include <level_zero/loader/ze_loader.h>
+ #include <string>
+ int main() {
+ ze_result_t result;
+ std::string loader("loader");
+ zel_component_version_t *versions;
+ size_t size = 0;
+ result = zeInit(0);
+ if (result != ZE_RESULT_SUCCESS) {
+ std::cerr << "Failed to init ze driver" << std::endl;
+ return -1;
+ }
+ zelLoaderGetVersions(&size, nullptr);
+ versions = new zel_component_version_t[size];
+ zelLoaderGetVersions(&size, versions);
+ for (size_t i = 0; i < size; i++) {
+ if (loader.compare(versions[i].component_name) == 0) {
+ std::cout << versions[i].component_lib_version.major << "."
+ << versions[i].component_lib_version.minor << "."
+ << versions[i].component_lib_version.patch;
+ break;
+ }
+ }
+ delete[] versions;
+ return 0;
+ }
+ ]====]
+ )
+
+ set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp)
+
+ file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}")
+
+ # We need both the directories in the include path as ze_loader.h
+ # includes "ze_api.h" and not "level_zero/ze_api.h".
+ list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+ list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+ list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
+ try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
+ "${CMAKE_BINARY_DIR}"
+ "${L0_VERSIONEER_FILE}"
+ LINK_LIBRARIES ${LevelZero_LIBRARY}
+ CMAKE_FLAGS
+ "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+ RUN_OUTPUT_VARIABLE L0_VERSION
+ )
+ if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
+ set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+ message(STATUS "Found Level Zero of version: ${L0_VERSION}")
+ else()
+ message(FATAL_ERROR
+ "Could not compile a level-zero program to extract loader version"
+ )
+ endif()
+endfunction(get_l0_loader_version)
+
+if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
+ list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
+ list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+ if(OpenCL_FOUND)
+ list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+ endif()
+
+ cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
+ set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
+
+ if(NOT TARGET LevelZero::LevelZero)
+ add_library(LevelZero::LevelZero INTERFACE IMPORTED)
+ set_target_properties(LevelZero::LevelZero
+ PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
+ )
+ set_target_properties(LevelZero::LevelZero
+ PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
+ )
+ endif()
+endif()
+
+# Check if a specific version of Level Zero is required
+if(LevelZero_FIND_VERSION)
+ get_l0_loader_version()
+ set(VERSION_GT_FIND_VERSION FALSE)
+ compare_versions(
+ ${LevelZero_VERSION}
+ ${LevelZero_FIND_VERSION}
+ VERSION_GT_FIND_VERSION
+ )
+ if(${VERSION_GT_FIND_VERSION})
+ set(LevelZero_FOUND TRUE)
+ else()
+ set(LevelZero_FOUND FALSE)
+ endif()
+else()
+ set(LevelZero_FOUND TRUE)
+endif()
+
+find_package_handle_standard_args(LevelZero
+ REQUIRED_VARS
+ LevelZero_FOUND
+ LevelZero_INCLUDE_DIRS
+ LevelZero_LIBRARY
+ LevelZero_LIBRARIES_DIR
+ HANDLE_COMPONENTS
+)
+mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+
+if(LevelZero_FOUND)
+ find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
+ "(found version ${LevelZero_VERSION})"
+ )
+else()
+ find_package_message(LevelZero "Could not find LevelZero" "")
+endif()
diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake
new file mode 100644
index 000000000000000..38b065a3f284c2c
--- /dev/null
+++ b/mlir/cmake/modules/FindSyclRuntime.cmake
@@ -0,0 +1,68 @@
+# CMake find_package() module for SYCL Runtime
+#
+# Example usage:
+#
+# find_package(SyclRuntime)
+#
+# If successful, the following variables will be defined:
+# SyclRuntime_FOUND
+# SyclRuntime_INCLUDE_DIRS
+# SyclRuntime_LIBRARY
+# SyclRuntime_LIBRARIES_DIR
+#
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT DEFINED ENV{CMPLR_ROOT})
+ message(WARNING "Please make sure to install Intel DPC++ Compiler and run setvars.(sh/bat)")
+ message(WARNING "You can download standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers")
+else()
+ if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux"))
+ set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux")
+ elseif(WIN32)
+ set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows")
+ endif()
+ list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include")
+ list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl")
+
+ set(SyclRuntime_LIBRARY_DIR "${SyclRuntime_ROOT}/lib")
+
+ message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}")
+ find_library(SyclRuntime_LIBRARY
+ NAMES sycl
+ PATHS ${SyclRuntime_LIBRARY_DIR}
+ NO_DEFAULT_PATH
+ )
+endif()
+
+if(SyclRuntime_LIBRARY)
+ set(SyclRuntime_FOUND TRUE)
+ if(NOT TARGET SyclRuntime::SyclRuntime)
+ add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED)
+ set_target_properties(SyclRuntime::SyclRuntime
+ PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}"
+ )
+ set_target_properties(SyclRuntime::SyclRuntime
+ PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}"
+ )
+ endif()
+else()
+ set(SyclRuntime_FOUND FALSE)
+endif()
+
+find_package_handle_standard_args(SyclRuntime
+ REQUIRED_VARS
+ SyclRuntime_FOUND
+ SyclRuntime_INCLUDE_DIRS
+ SyclRuntime_LIBRARY
+ SyclRuntime_LIBRARY_DIR
+ HANDLE_COMPONENTS
+)
+
+mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS)
+
+if(SyclRuntime_FOUND)
+ find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "")
+else()
+ find_package_message(SyclRuntime "Could not find SyclRuntime" "")
+endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index ea33c2c6ed261e1..47b1e82d60ff03f 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -328,4 +328,39 @@ if(LLVM_ENABLE_PIC)
hip::host hip::amdhip64
)
endif()
+
+ if(MLIR_ENABLE_SYCL_RUNNER)
+ find_package(SyclRuntime)
+
+ if(NOT SyclRuntime_FOUND)
+ message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
+ endif()
+
+ find_package(LevelZero)
+
+ if(NOT LevelZero_FOUND)
+ message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+ endif()
+
+ add_mlir_library(sycl-runtime
+ SHARED
+ SyclRuntimeWrappers.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+ )
+
+ check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+ if(NOT CXX_HAS_FRTTI_FLAG)
+ message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+ endif()
+ target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti)
+
+ target_include_directories(sycl-runtime PRIVATE
+ ${MLIR_INCLUDE_DIRS}
+ )
+
+ target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+
+ set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+ endif()
endif()
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
new file mode 100644
index 000000000000000..ed5ed2170f411c3
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -0,0 +1,386 @@
+//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cfloat>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include <level_zero/ze_api.h>
+#include <map>
+#include <mutex>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+
+#ifdef _WIN32
+#define SYCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define SYCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+template <typename F> auto catchAll(F &&func) {
+ try {
+ return func();
+ } catch (const std::exception &e) {
+ fprintf(stdout, "An exception was thrown: %s\n", e.what());
+ fflush(stdout);
+ abort();
+ } catch (...) {
+ fprintf(stdout, "An unknown exception was thrown\n");
+ fflush(stdout);
+ abort();
+ }
+}
+
+#define L0_SAFE_CALL(call) \
+ { \
+ ze_result_t status = (call); \
+ if (status != ZE_RESULT_SUCCESS) { \
+ fprintf(stdout, "L0 error %d\n", status); \
+ fflush(stdout); \
+ abort(); \
+ } \
+ }
+
+} // namespace
+
+struct SpirvModule {
+ ze_module_handle_t module = nullptr;
+ ~SpirvModule();
+};
+
+namespace {
+// Create a Map for the spirv module lookup
+std::map<void *, SpirvModule> moduleCache;
+std::mutex mutexLock;
+} // namespace
+
+SpirvModule::~SpirvModule() {
+ L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
+}
+
+struct ParamDesc {
+ void *data;
+ size_t size;
+
+ bool operator==(const ParamDesc &rhs) const {
+ return data == rhs.data && size == rhs.size;
+ }
+
+ bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
+};
+
+template <typename T> size_t countUntil(T *ptr, T &&elem) {
+ assert(ptr);
+ auto curr = ptr;
+ while (*curr != elem) {
+ ++curr;
+ }
+ return static_cast<size_t>(curr - ptr);
+}
+
+static sycl::device getDefaultDevice() {
+ auto platformList = sycl::platform::get_platforms();
+ for (const auto &platform : platformList) {
+ auto platformName = platform.get_info<sycl::info::platform::name>();
+ bool isLevelZero = platformName.find("Level-Zero") != std::string::npos;
+ if (!isLevelZero)
+ continue;
+
+ return platform.get_devices()[0];
+ }
+}
+
+struct GPUSYCLQUEUE {
+
+ sycl::device syclDevice_;
+ sycl::context syclContext_;
+ sycl::queue syclQueue_;
+
+ GPUSYCLQUEUE(sycl::property_list propList) {
+
+ syclDevice_ = getDefaultDevice();
+ syclContext_ = sycl::context(syclDevice_);
+ syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+ }
+
+ GPUSYCLQUEUE(sycl::device *device, sycl::context *context,
+ sycl::property_list propList) {
+ syclDevice_ = *device;
+ syclContext_ = *context;
+ syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+ }
+ GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) {
+
+ syclDevice_ = *device;
+ syclContext_ = sycl::context(syclDevice_);
+ syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+ }
+
+ GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) {
+
+ syclDevice_ = getDefaultDevice();
+ syclContext_ = *context;
+ syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+ }
+
+}; // end of GPUSYCLQUEUE
+
+static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size,
+ size_t alignment, bool isShared) {
+ void *memPtr = nullptr;
+ if (isShared) {
+ memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_);
+ } else {
+ memPtr = sycl::aligned_alloc_device(alignment, size, queue->syclQueue_);
+ }
+ if (memPtr == nullptr) {
+ throw std::runtime_error(
+ "aligned_alloc_shared() failed to allocate memory!");
+ }
+ return memPtr;
+}
+
+static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) {
+ sycl::free(ptr, queue->syclQueue_);
+}
+
+static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
+ size_t dataSize) {
+ assert(data);
+ auto syclQueue = queue->syclQueue_;
+ ze_module_handle_t zeModule;
+
+ auto it = moduleCache.find((void *)data);
+ // Check the map if the module is present/cached.
+ if (it != moduleCache.end()) {
+ return it->second.module;
+ }
+
+ ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+ nullptr,
+ ZE_MODULE_FORMAT_IL_SPIRV,
+ dataSize,
+ (const uint8_t *)data,
+ nullptr,
+ nullptr};
+ auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+ syclQueue.get_device());
+ auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+ syclQueue.get_context());
+ L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
+ std::lock_guard<std::mutex> entryLock(mutexLock);
+ moduleCache[(void *)data].module = zeModule;
+ return zeModule;
+}
+
+static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
+ const char *name) {
+ assert(zeModule);
+ assert(name);
+ auto syclQueue = queue->syclQueue_;
+ ze_kernel_handle_t zeKernel;
+ sycl::kernel *syclKernel;
+ ze_kernel_desc_t desc = {};
+ desc.pKernelName = name;
+
+ L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
+ sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
+ sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
+ sycl::bundle_state::executable>(
+ {zeModule}, syclQueue.get_context());
+
+ auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
+ {kernelBundle, zeKernel}, syclQueue.get_context());
+ syclKernel = new sycl::kernel(kernel);
+ return syclKernel;
+}
+
+static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel,
+ sycl::nd_range<3> NdRange, ParamDesc *params,
+ size_t sharedMemBytes) {
+ auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
+ // The assumption is, if there is a param for the shared local memory,
+ // then that will always be the last argument.
+ if (sharedMemBytes) {
+ paramsCount = paramsCount - 1;
+ }
+ sycl::event event = queue.submit([&](sycl::handler &cgh) {
+ for (size_t i = 0; i < paramsCount; i++) {
+ auto param = params[i];
+ cgh.set_arg(static_cast<uint32_t>(i),
+ *(static_cast<void **>(param.data)));
+ }
+ if (sharedMemBytes) {
+ // TODO: Handle other data types
+ using share_mem_t =
+ sycl::accessor<float, 1, sycl::access::mode::read_write,
+ sycl::access::target::local>;
+ share_mem_t local_buffer =
+ share_mem_t(sharedMemBytes / sizeof(float), cgh);
+ cgh.set_arg(paramsCount, local_buffer);
+ cgh.parallel_for(NdRange, *kernel);
+ } else {
+ cgh.parallel_for(NdRange, *kernel);
+ }
+ });
+ return event;
+}
+
+static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
+ size_t gridX, size_t gridY, size_t gridZ,
+ size_t blockX, size_t blockY, size_t blockZ,
+ size_t sharedMemBytes, ParamDesc *params) {
+ auto syclQueue = queue->syclQueue_;
+ auto syclGlobalRange =
+ ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
+ auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
+ sycl::nd_range<3> syclNdRange(
+ sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
+
+ if (getenv("IMEX_ENABLE_PROFILING")) {
+ auto executionTime = 0.0f;
+ auto maxTime = 0.0f;
+ auto minTime = FLT_MAX;
+ auto rounds = 100;
+ auto warmups = 3;
+
+ if (getenv("IMEX_PROFILING_RUNS")) {
+ auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
+ if (runs)
+ rounds = runs;
+ }
+
+ if (getenv("IMEX_PROFILING_WARMUPS")) {
+ auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
+ if (warmups)
+ warmups = runs;
+ }
+
+ // warmups
+ for (int r = 0; r < warmups; r++) {
+ enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+ }
+
+ for (int r = 0; r < rounds; r++) {
+ sycl::event event =
+ enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+
+ auto startTime = event.get_profiling_info<
+ cl::sycl::info::event_profiling::command_start>();
+ auto endTime = event.get_profiling_info<
+ cl::sycl::info::event_profiling::command_end>();
+ auto gap = float(endTime - startTime) / 1000000.0f;
+ executionTime += gap;
+ if (gap > maxTime)
+ maxTime = gap;
+ if (gap < minTime)
+ minTime = gap;
+ }
+
+ fprintf(stdout,
+ "the kernel execution time is (ms):"
+ "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
+ executionTime / rounds, minTime, maxTime, rounds);
+ } else {
+ enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+ }
+}
+
+// Wrappers
+
+extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
+ void *context) {
+ auto propList = sycl::property_list{};
+ if (getenv("IMEX_ENABLE_PROFILING")) {
+ propList = sycl::property_list{sycl::property::queue::enable_profiling()};
+ }
+ return catchAll([&]() {
+ if (!device && !context) {
+ return new GPUSYCLQUEUE(propList);
+ } else if (device && context) {
+ // TODO: Check if the pointers/address is valid and holds the correct
+ // device and context
+ return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
+ static_cast<sycl::context *>(context), propList);
+ } else if (device && !context) {
+ return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
+ } else {
+ return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) {
+ catchAll([&]() { delete queue; });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void *
+gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+ return catchAll([&]() {
+ if (queue) {
+ return allocDeviceMemory(queue, size, alignment, isShared);
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) {
+ catchAll([&]() {
+ if (queue && ptr) {
+ deallocDeviceMemory(queue, ptr);
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
+gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) {
+ return catchAll([&]() {
+ if (queue) {
+ return loadModule(queue, data, dataSize);
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
+gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) {
+ return catchAll([&]() {
+ if (queue) {
+ return getKernel(queue, module, name);
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX,
+ size_t gridY, size_t gridZ, size_t blockX, size_t blockY,
+ size_t blockZ, size_t sharedMemBytes, void *params) {
+ return catchAll([&]() {
+ if (queue) {
+ launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+ sharedMemBytes, static_cast<ParamDesc *>(params));
+ }
+ });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) {
+
+ catchAll([&]() {
+ if (queue) {
+ queue->syclQueue_.wait();
+ }
+ });
+}
>From 19bc391f87aba81196c0f4233b2a9ab808ade282 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 30 Aug 2023 20:59:43 +0000
Subject: [PATCH 02/13] Fix config and build issues.
---
mlir/lib/ExecutionEngine/CMakeLists.txt | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 47b1e82d60ff03f..101d9baafcccc49 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES
CRunnerUtils.cpp
CudaRuntimeWrappers.cpp
SparseTensorRuntime.cpp
+ SyclRuntimeWrappers.cpp
ExecutionEngine.cpp
Float16bits.cpp
RocmRuntimeWrappers.cpp
@@ -342,7 +343,7 @@ if(LLVM_ENABLE_PIC)
message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
endif()
- add_mlir_library(sycl-runtime
+ add_mlir_library(mlir_sycl_runtime
SHARED
SyclRuntimeWrappers.cpp
@@ -353,14 +354,14 @@ if(LLVM_ENABLE_PIC)
if(NOT CXX_HAS_FRTTI_FLAG)
message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
endif()
- target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti)
+ target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
- target_include_directories(sycl-runtime PRIVATE
+ target_include_directories(mlir_sycl_runtime PRIVATE
${MLIR_INCLUDE_DIRS}
)
- target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+ target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
- set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+ set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
endif()
endif()
>From 330d04db7155bbf416ee422934b47eac3dea70ad Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Aug 2023 15:09:11 -0700
Subject: [PATCH 03/13] Suppress clang compiler error.
---
mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index ed5ed2170f411c3..439d31134aa8b9a 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -58,6 +58,9 @@ template <typename F> auto catchAll(F &&func) {
} // namespace
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
struct SpirvModule {
ze_module_handle_t module = nullptr;
~SpirvModule();
@@ -73,6 +76,8 @@ SpirvModule::~SpirvModule() {
L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
}
+#pragma clang diagnostic pop
+
struct ParamDesc {
void *data;
size_t size;
>From 203d23eefe5a32f4c13313579bd1d9b9630e2413 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 09:01:06 -0700
Subject: [PATCH 04/13] Add gpu serialize to spirv pass.
---
.../mlir/Dialect/GPU/Transforms/Passes.td | 4 ++
mlir/lib/Dialect/GPU/CMakeLists.txt | 1 +
.../GPU/Transforms/SerializeToSPIRV.cpp | 70 +++++++++++++++++++
.../GPU/Transforms/serialize-spirv.mlir | 53 ++++++++++++++
4 files changed, 128 insertions(+)
create mode 100644 mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
create mode 100644 mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index fc20bd2ed921aea..f285f45448ecc7e 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -187,4 +187,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> {
];
}
+def GpuSerializeToSPIRVPass : Pass<"gpu-serialize-to-spirv", "ModuleOp"> {
+ let summary = "Serialize spirv dialect to spirv binary";
+}
+
#endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6244132c073a4a6..e2c1dc7adf646f7 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/SerializeToBlob.cpp
Transforms/SerializeToCubin.cpp
Transforms/SerializeToHsaco.cpp
+ Transforms/SerializeToSPIRV.cpp
Transforms/ShuffleRewriter.cpp
Transforms/ROCDLAttachTarget.cpp
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
new file mode 100644
index 000000000000000..f013f531371de86
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
@@ -0,0 +1,70 @@
+//===- SerializeToSPIRV.cpp - Convert GPU kernel to SPIRV blob -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass iterates all the SPIR-V modules in the top module and serializes
+/// each SPIR-V module to SPIR-V binary and then attachs the binary blob as a
+/// string attribute to the corresponding gpu module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Target/SPIRV/Serialization.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_GPUSERIALIZETOSPIRVPASS
+#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+struct GpuSerializeToSPIRVPass : public mlir::impl::GpuSerializeToSPIRVPassBase<GpuSerializeToSPIRVPass> {
+public:
+ void runOnOperation() override {
+ auto mod = getOperation();
+ llvm::SmallVector<uint32_t, 0> spvBinary;
+ for (mlir::gpu::GPUModuleOp gpuMod : mod.getOps<gpu::GPUModuleOp>()) {
+ auto name = gpuMod.getName();
+ // check that the spv module has the same name with gpu module except the
+ // prefix "__spv__"
+ auto isSameMod = [&](spirv::ModuleOp spvMod) -> bool {
+ auto spvModName = spvMod.getName();
+ return spvModName->consume_front("__spv__") && spvModName == name;
+ };
+ auto spvMods = mod.getOps<spirv::ModuleOp>();
+ auto it = llvm::find_if(spvMods, isSameMod);
+ if (it == spvMods.end()) {
+ gpuMod.emitError() << "Unable to find corresponding SPIR-V module";
+ signalPassFailure();
+ return;
+ }
+ auto spvMod = *it;
+
+ spvBinary.clear();
+ // serialize the spv module to spv binary
+ if (mlir::failed(spirv::serialize(spvMod, spvBinary))) {
+ spvMod.emitError() << "Failed to serialize SPIR-V module";
+ signalPassFailure();
+ return;
+ }
+
+ // attach the spv binary to the gpu module
+ auto spvData =
+ llvm::StringRef(reinterpret_cast<const char *>(spvBinary.data()),
+ spvBinary.size() * sizeof(uint32_t));
+ auto spvAttr = mlir::StringAttr::get(&getContext(), spvData);
+ gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr);
+ spvMod->erase();
+ }
+ }
+};
diff --git a/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
new file mode 100644
index 000000000000000..d70e18f3401d38d
--- /dev/null
+++ b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt -gpu-serialize-to-spirv %s | FileCheck %s
+module attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, #spirv.resource_limits<>>} {
+ // CHECK: gpu.module @addt_kernel attributes {gpu.binary =
+ spirv.module @__spv__addt_kernel Physical64 OpenCL requires #spirv.vce<v1.0, [Int64, Addresses, Kernel], []> {
+ spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input>
+ spirv.func @addt_kernel(%arg0: !spirv.ptr<f32, CrossWorkgroup>, %arg1: !spirv.ptr<f32, CrossWorkgroup>, %arg2: !spirv.ptr<f32, CrossWorkgroup>) "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64} {
+ %cst5_i64 = spirv.Constant 5 : i64
+ %__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+ %0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64>
+ %1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64>
+ %__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+ %2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64>
+ %3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64>
+ spirv.Branch ^bb1
+ ^bb1: // pred: ^bb0
+ %4 = spirv.IMul %1, %cst5_i64 : i64
+ %5 = spirv.IAdd %4, %3 : i64
+ %6 = spirv.InBoundsPtrAccessChain %arg0[%5] : !spirv.ptr<f32, CrossWorkgroup>, i64
+ %7 = spirv.Load "CrossWorkgroup" %6 ["Aligned", 4] : f32
+ %8 = spirv.IMul %1, %cst5_i64 : i64
+ %9 = spirv.IAdd %8, %3 : i64
+ %10 = spirv.InBoundsPtrAccessChain %arg1[%9] : !spirv.ptr<f32, CrossWorkgroup>, i64
+ %11 = spirv.Load "CrossWorkgroup" %10 ["Aligned", 4] : f32
+ %12 = spirv.FAdd %7, %11 : f32
+ %13 = spirv.IMul %1, %cst5_i64 : i64
+ %14 = spirv.IAdd %13, %3 : i64
+ %15 = spirv.InBoundsPtrAccessChain %arg2[%14] : !spirv.ptr<f32, CrossWorkgroup>, i64
+ spirv.Store "CrossWorkgroup" %15, %12 ["Aligned", 4] : f32
+ spirv.Return
+ }
+ spirv.EntryPoint "Kernel" @addt_kernel, @__builtin_var_WorkgroupId__
+ }
+ gpu.module @addt_kernel {
+ gpu.func @addt_kernel(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %c5 = arith.constant 5 : index
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ cf.br ^bb1
+ ^bb1: // pred: ^bb0
+ %2 = arith.muli %0, %c5 : index
+ %3 = arith.addi %2, %1 : index
+ %4 = memref.load %arg0[%3] : memref<?xf32>
+ %5 = arith.muli %0, %c5 : index
+ %6 = arith.addi %5, %1 : index
+ %7 = memref.load %arg1[%6] : memref<?xf32>
+ %8 = arith.addf %4, %7 : f32
+ %9 = arith.muli %0, %c5 : index
+ %10 = arith.addi %9, %1 : index
+ memref.store %8, %arg2[%10] : memref<?xf32>
+ gpu.return
+ }
+ }
+}
>From a755e8f6ea0d919d62640b3aa41db93f793812dd Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 09:09:22 -0700
Subject: [PATCH 05/13] Add dependency.
---
mlir/lib/Dialect/GPU/CMakeLists.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index e2c1dc7adf646f7..38fa60ba06f59a9 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -97,6 +97,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRSupport
MLIRROCDLTarget
MLIRTransformUtils
+ MLIRSPIRVSerialization
)
add_subdirectory(TransformOps)
>From 1d5d04661841e50e29dac0620c33478b4a5f572b Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.pate at intel.com>
Date: Wed, 6 Sep 2023 17:17:30 +0000
Subject: [PATCH 06/13] Add Sycl Runtime Wrappers
---
.../GPUCommon/GPUToLLVMConversion.cpp | 24 +-
.../ExecutionEngine/CudaRuntimeWrappers.cpp | 5 +-
.../ExecutionEngine/RocmRuntimeWrappers.cpp | 4 +-
.../ExecutionEngine/SyclRuntimeWrappers.cpp | 301 ++++--------------
...ower-launch-func-to-gpu-runtime-calls.mlir | 7 +-
.../Integration/GPU/SYCL/gpu-to-spirv.mlir | 50 +++
6 files changed, 144 insertions(+), 247 deletions(-)
create mode 100644 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0bc8..111cfbf93f26a9b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -101,7 +101,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
FunctionCallBuilder moduleLoadCallBuilder = {
"mgpuModuleLoad",
llvmPointerType /* void *module */,
- {llvmPointerType /* void *cubin */}};
+ {llvmPointerType, /* void *cubin */
+ llvmInt64Type /* size_t size */}};
FunctionCallBuilder moduleUnloadCallBuilder = {
"mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
FunctionCallBuilder moduleGetFunctionCallBuilder = {
@@ -125,7 +126,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
llvmInt32Type, /* unsigned int sharedMemBytes */
llvmPointerType, /* void *hstream */
llvmPointerPointerType, /* void **kernelParams */
- llvmPointerPointerType /* void **extra */
+ llvmPointerPointerType, /* void **extra */
+ llvmInt64Type /* size_t paramsCount */
}};
FunctionCallBuilder streamCreateCallBuilder = {
"mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
@@ -1134,7 +1136,21 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
- auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
+ // SPIRV requires binary size
+ auto gpuBlob = binaryAttr.getValue();
+ auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, llvmInt64Type,
+ mlir::IntegerAttr::get(llvmInt64Type,
+ static_cast<int64_t>(gpuBlob.size())));
+
+ auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, llvmInt64Type,
+ mlir::IntegerAttr::get(
+ llvmInt64Type,
+ static_cast<int64_t>(launchOp.getNumKernelOperands())));
+
+ auto module =
+ moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
// Get the function from the module. The name corresponds to the name of
// the kernel function.
auto kernelName = generateKernelNameConstant(
@@ -1158,7 +1174,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
{function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
- /*extra=*/nullpointer});
+ /*extra=*/nullpointer, paramsCount});
if (launchOp.getAsyncToken()) {
// Async launch: make dependent ops use the same stream.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe66365..8a53d99c778a63a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false;
#endif // MLIR_ENABLE_CUDA_CUSPARSELT
#endif // MLIR_ENABLE_CUDA_CUSPARSE
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
+mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
ScopedContext scopedContext;
CUmodule module = nullptr;
CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
@@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem, CUstream stream, void **params,
- void **extra) {
+ void **extra, size_t /*paramsCount*/) {
ScopedContext scopedContext;
int32_t maxShmem = 0;
CUdevice device = getDefaultCuDevice();
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..998ff5b8b829f88 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -32,7 +32,7 @@
thread_local static int32_t defaultDevice = 0;
-extern "C" hipModule_t mgpuModuleLoad(void *data) {
+extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
hipModule_t module = nullptr;
HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));
return module;
@@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
intptr_t blockX, intptr_t blockY,
intptr_t blockZ, int32_t smem,
hipStream_t stream, void **params,
- void **extra) {
+ void **extra, size_t /*paramsCount*/) {
HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
blockX, blockY, blockZ, smem,
stream, params, extra));
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index 439d31134aa8b9a..60ac27bd84e72fd 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -1,10 +1,14 @@
-//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===//
+//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+//
+// Implements C wrappers around the sycl runtime library.
+//
+//===----------------------------------------------------------------------===//
#include <algorithm>
#include <array>
@@ -32,7 +36,8 @@
namespace {
-template <typename F> auto catchAll(F &&func) {
+template <typename F>
+auto catchAll(F &&func) {
try {
return func();
} catch (const std::exception &e) {
@@ -58,46 +63,6 @@ template <typename F> auto catchAll(F &&func) {
} // namespace
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wglobal-constructors"
-
-struct SpirvModule {
- ze_module_handle_t module = nullptr;
- ~SpirvModule();
-};
-
-namespace {
-// Create a Map for the spirv module lookup
-std::map<void *, SpirvModule> moduleCache;
-std::mutex mutexLock;
-} // namespace
-
-SpirvModule::~SpirvModule() {
- L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
-}
-
-#pragma clang diagnostic pop
-
-struct ParamDesc {
- void *data;
- size_t size;
-
- bool operator==(const ParamDesc &rhs) const {
- return data == rhs.data && size == rhs.size;
- }
-
- bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
-};
-
-template <typename T> size_t countUntil(T *ptr, T &&elem) {
- assert(ptr);
- auto curr = ptr;
- while (*curr != elem) {
- ++curr;
- }
- return static_cast<size_t>(curr - ptr);
-}
-
static sycl::device getDefaultDevice() {
auto platformList = sycl::platform::get_platforms();
for (const auto &platform : platformList) {
@@ -108,74 +73,39 @@ static sycl::device getDefaultDevice() {
return platform.get_devices()[0];
}
+ throw std::runtime_error("getDefaultDevice failed");
}
-struct GPUSYCLQUEUE {
+// Create global device and context
+sycl::device syclDevice = getDefaultDevice();
+sycl::context syclContext = sycl::context(syclDevice);
- sycl::device syclDevice_;
- sycl::context syclContext_;
+struct QUEUE {
sycl::queue syclQueue_;
- GPUSYCLQUEUE(sycl::property_list propList) {
-
- syclDevice_ = getDefaultDevice();
- syclContext_ = sycl::context(syclDevice_);
- syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
- }
-
- GPUSYCLQUEUE(sycl::device *device, sycl::context *context,
- sycl::property_list propList) {
- syclDevice_ = *device;
- syclContext_ = *context;
- syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
- }
- GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) {
-
- syclDevice_ = *device;
- syclContext_ = sycl::context(syclDevice_);
- syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
- }
-
- GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) {
-
- syclDevice_ = getDefaultDevice();
- syclContext_ = *context;
- syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
- }
-
-}; // end of GPUSYCLQUEUE
+ QUEUE() { syclQueue_ = sycl::queue(syclContext, syclDevice); }
+};
-static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size,
- size_t alignment, bool isShared) {
+static void *allocDeviceMemory(QUEUE *queue, size_t size, bool isShared) {
void *memPtr = nullptr;
if (isShared) {
- memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_);
+ memPtr = sycl::aligned_alloc_shared(64, size, syclDevice, syclContext);
} else {
- memPtr = sycl::aligned_alloc_device(alignment, size, queue->syclQueue_);
+ memPtr = sycl::aligned_alloc_device(64, size, syclDevice, syclContext);
}
if (memPtr == nullptr) {
- throw std::runtime_error(
- "aligned_alloc_shared() failed to allocate memory!");
+ throw std::runtime_error("mem allocation failed!");
}
return memPtr;
}
-static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) {
+static void deallocDeviceMemory(QUEUE *queue, void *ptr) {
sycl::free(ptr, queue->syclQueue_);
}
-static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
- size_t dataSize) {
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
assert(data);
- auto syclQueue = queue->syclQueue_;
ze_module_handle_t zeModule;
-
- auto it = moduleCache.find((void *)data);
- // Check the map if the module is present/cached.
- if (it != moduleCache.end()) {
- return it->second.module;
- }
-
ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
nullptr,
ZE_MODULE_FORMAT_IL_SPIRV,
@@ -183,21 +113,17 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
(const uint8_t *)data,
nullptr,
nullptr};
- auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
- syclQueue.get_device());
- auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
- syclQueue.get_context());
+ auto zeDevice =
+ sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclDevice);
+ auto zeContext =
+ sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclContext);
L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
- std::lock_guard<std::mutex> entryLock(mutexLock);
- moduleCache[(void *)data].module = zeModule;
return zeModule;
}
-static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
- const char *name) {
+static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) {
assert(zeModule);
assert(name);
- auto syclQueue = queue->syclQueue_;
ze_kernel_handle_t zeKernel;
sycl::kernel *syclKernel;
ze_kernel_desc_t desc = {};
@@ -206,186 +132,87 @@ static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
- sycl::bundle_state::executable>(
- {zeModule}, syclQueue.get_context());
+ sycl::bundle_state::executable>({zeModule},
+ syclContext);
auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
- {kernelBundle, zeKernel}, syclQueue.get_context());
+ {kernelBundle, zeKernel}, syclContext);
syclKernel = new sycl::kernel(kernel);
return syclKernel;
}
-static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel,
- sycl::nd_range<3> NdRange, ParamDesc *params,
- size_t sharedMemBytes) {
- auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
- // The assumption is, if there is a param for the shared local memory,
- // then that will always be the last argument.
- if (sharedMemBytes) {
- paramsCount = paramsCount - 1;
- }
- sycl::event event = queue.submit([&](sycl::handler &cgh) {
- for (size_t i = 0; i < paramsCount; i++) {
- auto param = params[i];
- cgh.set_arg(static_cast<uint32_t>(i),
- *(static_cast<void **>(param.data)));
- }
- if (sharedMemBytes) {
- // TODO: Handle other data types
- using share_mem_t =
- sycl::accessor<float, 1, sycl::access::mode::read_write,
- sycl::access::target::local>;
- share_mem_t local_buffer =
- share_mem_t(sharedMemBytes / sizeof(float), cgh);
- cgh.set_arg(paramsCount, local_buffer);
- cgh.parallel_for(NdRange, *kernel);
- } else {
- cgh.parallel_for(NdRange, *kernel);
- }
- });
- return event;
-}
-
-static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
- size_t gridX, size_t gridY, size_t gridZ,
- size_t blockX, size_t blockY, size_t blockZ,
- size_t sharedMemBytes, ParamDesc *params) {
- auto syclQueue = queue->syclQueue_;
+static void launchKernel(QUEUE *queue, sycl::kernel *kernel, size_t gridX,
+ size_t gridY, size_t gridZ, size_t blockX,
+ size_t blockY, size_t blockZ, size_t sharedMemBytes,
+ void **params, size_t paramsCount) {
auto syclGlobalRange =
::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
sycl::nd_range<3> syclNdRange(
sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
- if (getenv("IMEX_ENABLE_PROFILING")) {
- auto executionTime = 0.0f;
- auto maxTime = 0.0f;
- auto minTime = FLT_MAX;
- auto rounds = 100;
- auto warmups = 3;
-
- if (getenv("IMEX_PROFILING_RUNS")) {
- auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
- if (runs)
- rounds = runs;
- }
-
- if (getenv("IMEX_PROFILING_WARMUPS")) {
- auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
- if (warmups)
- warmups = runs;
- }
-
- // warmups
- for (int r = 0; r < warmups; r++) {
- enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
- }
-
- for (int r = 0; r < rounds; r++) {
- sycl::event event =
- enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
-
- auto startTime = event.get_profiling_info<
- cl::sycl::info::event_profiling::command_start>();
- auto endTime = event.get_profiling_info<
- cl::sycl::info::event_profiling::command_end>();
- auto gap = float(endTime - startTime) / 1000000.0f;
- executionTime += gap;
- if (gap > maxTime)
- maxTime = gap;
- if (gap < minTime)
- minTime = gap;
+ queue->syclQueue_.submit([&](sycl::handler &cgh) {
+ for (size_t i = 0; i < paramsCount; i++) {
+ cgh.set_arg(static_cast<uint32_t>(i), *(static_cast<void **>(params[i])));
}
-
- fprintf(stdout,
- "the kernel execution time is (ms):"
- "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
- executionTime / rounds, minTime, maxTime, rounds);
- } else {
- enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
- }
+ cgh.parallel_for(syclNdRange, *kernel);
+ });
}
// Wrappers
-extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
- void *context) {
- auto propList = sycl::property_list{};
- if (getenv("IMEX_ENABLE_PROFILING")) {
- propList = sycl::property_list{sycl::property::queue::enable_profiling()};
- }
- return catchAll([&]() {
- if (!device && !context) {
- return new GPUSYCLQUEUE(propList);
- } else if (device && context) {
- // TODO: Check if the pointers/address is valid and holds the correct
- // device and context
- return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
- static_cast<sycl::context *>(context), propList);
- } else if (device && !context) {
- return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
- } else {
- return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
- }
- });
+extern "C" SYCL_RUNTIME_EXPORT QUEUE *mgpuStreamCreate() {
+
+ return catchAll([&]() { return new QUEUE(); });
}
-extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(QUEUE *queue) {
catchAll([&]() { delete queue; });
}
-extern "C" SYCL_RUNTIME_EXPORT void *
-gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+extern "C" SYCL_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t size, QUEUE *queue,
+ bool isShared) {
return catchAll([&]() {
- if (queue) {
- return allocDeviceMemory(queue, size, alignment, isShared);
- }
+ return allocDeviceMemory(queue, static_cast<size_t>(size), true);
});
}
-extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, QUEUE *queue) {
catchAll([&]() {
- if (queue && ptr) {
+ if (ptr) {
deallocDeviceMemory(queue, ptr);
}
});
}
extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
-gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) {
- return catchAll([&]() {
- if (queue) {
- return loadModule(queue, data, dataSize);
- }
- });
+mgpuModuleLoad(const void *data, size_t gpuBlobSize) {
+ return catchAll([&]() { return loadModule(data, gpuBlobSize); });
}
extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
-gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) {
- return catchAll([&]() {
- if (queue) {
- return getKernel(queue, module, name);
- }
- });
+mgpuModuleGetFunction(ze_module_handle_t module, const char *name) {
+ return catchAll([&]() { return getKernel(module, name); });
}
extern "C" SYCL_RUNTIME_EXPORT void
-gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX,
- size_t gridY, size_t gridZ, size_t blockX, size_t blockY,
- size_t blockZ, size_t sharedMemBytes, void *params) {
+mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ,
+ size_t blockX, size_t blockY, size_t blockZ,
+ size_t sharedMemBytes, QUEUE *queue, void **params,
+ void **extra, size_t paramsCount) {
return catchAll([&]() {
- if (queue) {
- launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
- sharedMemBytes, static_cast<ParamDesc *>(params));
- }
+ launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+ sharedMemBytes, params, paramsCount);
});
}
-extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(QUEUE *queue) {
- catchAll([&]() {
- if (queue) {
- queue->syclQueue_.wait();
- }
- });
+ catchAll([&]() { queue->syclQueue_.wait(); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuModuleUnload(ze_module_handle_t module) {
+
+ catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); });
}
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 2cdc4e8dbb1ad67..96e8a6dbd35b171 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -34,8 +34,10 @@ module attributes {gpu.container_module} {
// CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
// CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
// CHECK-SAME: -> !llvm.ptr
+ // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant
+ // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant
- // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]])
+ // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]])
// CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
// CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
@@ -53,10 +55,11 @@ module attributes {gpu.container_module} {
// CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
// CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr
+
// CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
// CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
- // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]])
+ // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]])
// CHECK: llvm.call @mgpuStreamSynchronize
// CHECK: llvm.call @mgpuStreamDestroy
// CHECK: llvm.call @mgpuModuleUnload
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
new file mode 100644
index 000000000000000..0be0e31a3d71e94
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -0,0 +1,50 @@
+module @add attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+ memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+ %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+ %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+ %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+ call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+ return
+ }
+ func.func private @printMemrefI64(memref<*xi64>)
+ func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+ %c3 = arith.constant 3 : index
+ %c1 = arith.constant 1 : index
+ %0 = gpu.wait async
+ %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64>
+ gpu.wait [%asyncToken]
+ memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
+ %1 = gpu.wait async
+ %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64>
+ gpu.wait [%asyncToken_1]
+ memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+ %2 = gpu.wait async
+ %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64>
+ %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<3x3xi64>
+ memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+ %7 = gpu.dealloc async [%6] %memref : memref<3x3xi64>
+ gpu.wait [%7]
+ return %alloc : memref<3x3xi64>
+ }
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+ %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+ %4 = arith.addi %2, %3 : i64
+ memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+ gpu.return
+ }
+ }
+}
+
+
\ No newline at end of file
>From 66fe69f30c015c2619d840a4ef91b18219ebf7c7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 19:23:39 +0000
Subject: [PATCH 07/13] Temp save. Builds but Integration test fails.
---
mlir/test/CMakeLists.txt | 4 ++++
mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 11 +++++++++--
mlir/test/lit.cfg.py | 3 +++
mlir/test/lit.site.cfg.py.in | 1 +
4 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..874e7718f4a36d1 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -135,6 +135,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
endif()
+if(MLIR_ENABLE_SYCL_RUNNER)
+ list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
+endif()
+
list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
if(LLVM_BUILD_EXAMPLES)
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 0be0e31a3d71e94..6ff9d4bf6ca8f5a 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,3 +1,10 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
module @add attributes {gpu.container_module} {
memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
@@ -34,7 +41,7 @@ module @add attributes {gpu.container_module} {
gpu.wait [%7]
return %alloc : memref<3x3xi64>
}
- gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%0 = gpu.block_id x
%1 = gpu.block_id y
@@ -47,4 +54,4 @@ module @add attributes {gpu.container_module} {
}
}
-
\ No newline at end of file
+
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index f265ac794c6f6db..5d3a4dc575a7b28 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -119,6 +119,9 @@ def add_runtime(name):
if config.enable_cuda_runner:
tools.extend([add_runtime("mlir_cuda_runtime")])
+if config.enable_sycl_runner:
+ tools.extend([add_runtime("mlir_sycl_runtime")])
+
# The following tools are optional
tools.extend(
[
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..897c12f3abcac75 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -30,6 +30,7 @@ config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
>From b0758f0ed491b7264cbdd10fc4e82d280e5298f2 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.pate at intel.com>
Date: Wed, 6 Sep 2023 17:17:30 +0000
Subject: [PATCH 08/13] Update Sycl Runtime Wrappers
---
.../lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 12 ++++++++++--
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 3 ++-
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp | 3 ++-
.../GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir | 3 ++-
mlir/test/Conversion/GPUCommon/typed-pointers.mlir | 3 ++-
mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 8 +++-----
6 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 111cfbf93f26a9b..2b92c1cd8b00e50 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -169,7 +169,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
"mgpuMemAlloc",
llvmPointerType /* void * */,
{llvmIntPtrType /* intptr_t sizeBytes */,
- llvmPointerType /* void *stream */}};
+ llvmPointerType /* void *stream */,
+ llvmInt64Type /* size_t isHostShared */}};
FunctionCallBuilder deallocCallBuilder = {
"mgpuMemFree",
llvmVoidType,
@@ -801,6 +802,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
auto loc = allocOp.getLoc();
+ bool isShared = allocOp.getHostShared();
+
// Get shape of the memref as values: static sizes are constant
// values and dynamic sizes are passed to 'alloc' as operands.
SmallVector<Value, 4> shape;
@@ -813,8 +816,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
// descriptor.
Type elementPtrType = this->getElementPtrType(memRefType);
auto stream = adaptor.getAsyncDependencies().front();
+
+ auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
Value allocatedPtr =
- allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+ allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+ .getResult();
if (!getTypeConverter()->useOpaquePointers())
allocatedPtr =
rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 8a53d99c778a63a..79dc2eed38f06a9 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -211,7 +211,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+ bool /*isHostShared*/) {
ScopedContext scopedContext;
CUdeviceptr ptr;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 998ff5b8b829f88..b50fd7eb9d05929 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
}
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+ bool /*isHostShared*/) {
void *ptr;
HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
%0 = gpu.wait async
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
- // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+ // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
// CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
%0 = gpu.wait async
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
- // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+ // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
// CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 6ff9d4bf6ca8f5a..b6180d322ce9329 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -21,15 +21,15 @@ module @add attributes {gpu.container_module} {
%c3 = arith.constant 3 : index
%c1 = arith.constant 1 : index
%0 = gpu.wait async
- %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64>
+ %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64>
gpu.wait [%asyncToken]
memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
%1 = gpu.wait async
- %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64>
+ %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<3x3xi64>
gpu.wait [%asyncToken_1]
memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
%2 = gpu.wait async
- %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64>
+ %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<3x3xi64>
%3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
gpu.wait [%3]
%alloc = memref.alloc() : memref<3x3xi64>
@@ -53,5 +53,3 @@ module @add attributes {gpu.container_module} {
}
}
}
-
-
>From c37e8d84ea6a9ec84b27ffe30f47b6acfbbda202 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 20:35:53 +0000
Subject: [PATCH 09/13] Add f32 integration test.
---
mlir/test/Integration/GPU/SYCL/addf.mlir | 58 ++++++++++++++++++++++++
1 file changed, 58 insertions(+)
create mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
new file mode 100644
index 000000000000000..b9fc36547220e77
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -0,0 +1,58 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+} {
+ memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
+ memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
+ %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
+ %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
+ %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
+ call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+ return
+ }
+ func.func private @printMemrefI64(memref<*xf32>)
+ func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
+ %c9 = arith.constant 9 : index
+ %c1 = arith.constant 1 : index
+ %0 = gpu.wait async
+ %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
+ gpu.wait [%asyncToken]
+ memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
+ %1 = gpu.wait async
+ %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
+ gpu.wait [%asyncToken_1]
+ memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
+ %2 = gpu.wait async
+ %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
+ %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
+ gpu.wait [%3]
+ %alloc = memref.alloc() : memref<9xf32>
+ memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
+ %4 = gpu.wait async
+ %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
+ %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
+ %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
+ gpu.wait [%7]
+ return %alloc : memref<9xf32>
+ }
+ gpu.module @test_kernel {
+ gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+ //%0 = gpu.block_id x
+ //%2 = memref.load %arg0[%0] : memref<9xf32>
+ //%3 = memref.load %arg1[%0] : memref<9xf32>
+ //%4 = arith.addf %2, %3 : f32
+ //memref.store %4, %arg2[%0] : memref<9xf32>
+ gpu.return
+ }
+ }
+ // CHECK: [3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3]
+}
>From 84d584d5ba21a8177eca4818a076daf1d471b38c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:26:30 +0000
Subject: [PATCH 10/13] Add new option to convert-gpu-to-spirv pass to handle
OpenCL
---
mlir/include/mlir/Conversion/Passes.td | 5 ++++-
mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 3 ++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275bf3..3bb6006a467fe37 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -568,7 +568,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
let options = [
Option<"use64bitIndex", "use-64bit-index",
"bool", /*default=*/"false",
- "Use 64-bit integers to convert index types">
+ "Use 64-bit integers to convert index types">,
+ Option<"useOpenCL", "use-opencl",
+ "bool", /*default=*/"false",
+ "Use OpenCL instead of Vulkan">
];
}
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index f37c70a771f5916..a52c99ec9daec16 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -71,7 +71,8 @@ void GPUToSPIRVPass::runOnOperation() {
std::unique_ptr<ConversionTarget> target =
spirv::getMemorySpaceToStorageClassTarget(*context);
spirv::MemorySpaceToStorageClassMap memorySpaceMap =
- spirv::mapMemorySpaceToVulkanStorageClass;
+ this->useOpenCL ? spirv::mapMemorySpaceToOpenCLStorageClass :
+ spirv::mapMemorySpaceToVulkanStorageClass;
spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap);
RewritePatternSet patterns(context);
>From 2f8ab53057dd7a1433bc4e867a265ee4a215bb2b Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:27:10 +0000
Subject: [PATCH 11/13] Update pass pipeline for integration tests.
---
mlir/test/Integration/GPU/SYCL/addf.mlir | 2 +-
mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index b9fc36547220e77..984b1e0ae528723 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_sycl_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index b6180d322ce9329..c52e723af68f32b 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_sycl_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
>From 41bc1125acb049e655aa9c934b8e3da1f64cc26f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:55:10 +0000
Subject: [PATCH 12/13] Fix.
---
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 4 ----
mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp | 5 +++++
mlir/test/Integration/GPU/SYCL/addf.mlir | 11 ++++++-----
mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 3 +++
4 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 2b92c1cd8b00e50..808431b82472471 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -789,10 +789,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::AllocOp allocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
- if (adaptor.getHostShared())
- return rewriter.notifyMatchFailure(
- allocOp, "host_shared allocation is not supported");
-
MemRefType memRefType = allocOp.getType();
if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index 60ac27bd84e72fd..6b40d4a6922c9f0 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -76,10 +76,15 @@ static sycl::device getDefaultDevice() {
throw std::runtime_error("getDefaultDevice failed");
}
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
// Create global device and context
sycl::device syclDevice = getDefaultDevice();
sycl::context syclContext = sycl::context(syclDevice);
+#pragma clang diagnostic pop
+
struct QUEUE {
sycl::queue syclQueue_;
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index 984b1e0ae528723..d4e00ddedf3877c 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -6,8 +6,7 @@
// RUN: | FileCheck %s
module @add attributes {
- gpu.container_module,
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+ gpu.container_module
} {
memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
@@ -16,10 +15,10 @@ module @add attributes {
%1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
%2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
%cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
- call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
return
}
- func.func private @printMemrefI64(memref<*xf32>)
+ func.func private @printMemrefF32(memref<*xf32>)
func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
%c9 = arith.constant 9 : index
%c1 = arith.constant 1 : index
@@ -44,7 +43,9 @@ module @add attributes {
gpu.wait [%7]
return %alloc : memref<9xf32>
}
- gpu.module @test_kernel {
+ gpu.module @test_kernel attributes {
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+ }{
gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
//%0 = gpu.block_id x
//%2 = memref.load %arg0[%0] : memref<9xf32>
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index c52e723af68f32b..36d132d0c94d32e 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -52,4 +52,7 @@ module @add attributes {gpu.container_module} {
gpu.return
}
}
+ // CHECK: [2, 4100, 6],
+ // CHECK: [16777224, 10, 4294971404],
+ // CHECK: [16777230, 1103806595088, 1099511627794]
}
>From aae8d757874239e9362eb60ea0eebfd572a0303f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:56:16 +0000
Subject: [PATCH 13/13] Remove unneeded test.
---
mlir/test/Integration/GPU/SYCL/addf.mlir | 59 ------------------------
1 file changed, 59 deletions(-)
delete mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
deleted file mode 100644
index d4e00ddedf3877c..000000000000000
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_sycl_runtime \
-// RUN: --shared-libs=%mlir_runner_utils \
-// RUN: --entry-point-result=void \
-// RUN: | FileCheck %s
-
-module @add attributes {
- gpu.container_module
-} {
- memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
- memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
- func.func @main() {
- %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
- %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
- %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
- %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
- call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
- return
- }
- func.func private @printMemrefF32(memref<*xf32>)
- func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
- %c9 = arith.constant 9 : index
- %c1 = arith.constant 1 : index
- %0 = gpu.wait async
- %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
- gpu.wait [%asyncToken]
- memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
- %1 = gpu.wait async
- %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
- gpu.wait [%asyncToken_1]
- memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
- %2 = gpu.wait async
- %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
- %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
- gpu.wait [%3]
- %alloc = memref.alloc() : memref<9xf32>
- memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
- %4 = gpu.wait async
- %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
- %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
- %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
- gpu.wait [%7]
- return %alloc : memref<9xf32>
- }
- gpu.module @test_kernel attributes {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
- }{
- gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
- //%0 = gpu.block_id x
- //%2 = memref.load %arg0[%0] : memref<9xf32>
- //%3 = memref.load %arg1[%0] : memref<9xf32>
- //%4 = arith.addf %2, %3 : f32
- //memref.store %4, %arg2[%0] : memref<9xf32>
- gpu.return
- }
- }
- // CHECK: [3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3]
-}
More information about the flang-commits
mailing list