[Openmp-commits] [openmp] Enabling Intel GPU Integration. (PR #65539)

Wed Sep 6 15:02:16 PDT 2023

https://github.com/silee2 created https://github.com/llvm/llvm-project/pull/65539:

None

>From 863a72b4e099f4aa24e43fdaaeb2ab0e171a0381 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 30 Aug 2023 13:44:02 -0700
Subject: [PATCH 01/13] Add SyclRuntimeWrappers and Add CMake option
 MLIR_ENABLE_SYCL_RUNNER

---
 mlir/CMakeLists.txt                           |   1 +
 mlir/cmake/modules/FindLevelZero.cmake        | 221 ++++++++++
 mlir/cmake/modules/FindSyclRuntime.cmake      |  68 +++
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  35 ++
 .../ExecutionEngine/SyclRuntimeWrappers.cpp   | 386 ++++++++++++++++++
 5 files changed, 711 insertions(+)
 create mode 100644 mlir/cmake/modules/FindLevelZero.cmake
 create mode 100644 mlir/cmake/modules/FindSyclRuntime.cmake
 create mode 100644 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index fa4f6e76f985fb5..4a67e018273819f 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -116,6 +116,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS})
 
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner")
+set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake
new file mode 100644
index 000000000000000..012187f0afc0b07
--- /dev/null
+++ b/mlir/cmake/modules/FindLevelZero.cmake
@@ -0,0 +1,221 @@
+# CMake find_package() module for level-zero
+#
+# Example usage:
+#
+# find_package(LevelZero)
+#
+# If successful, the following variables will be defined:
+# LevelZero_FOUND
+# LevelZero_INCLUDE_DIRS
+# LevelZero_LIBRARY
+# LevelZero_LIBRARIES_DIR
+#
+# By default, the module searches the standard paths to locate the "ze_api.h"
+# and the ze_loader shared library. When using a custom level-zero installation,
+# the environment variable "LEVEL_ZERO_DIR" should be specified telling the
+# module to get the level-zero library and headers from that location.
+
+include(FindPackageHandleStandardArgs)
+
+# Search path priority
+# 1. CMake Variable LEVEL_ZERO_DIR
+# 2. Environment Variable LEVEL_ZERO_DIR
+
+if(NOT LEVEL_ZERO_DIR)
+    if(DEFINED ENV{LEVEL_ZERO_DIR})
+        set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
+    endif()
+endif()
+
+if(LEVEL_ZERO_DIR)
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+        PATHS ${LEVEL_ZERO_DIR}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(LINUX)
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+                  ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+            NO_DEFAULT_PATH
+        )
+    else()
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+            NO_DEFAULT_PATH
+        )
+    endif()
+else()
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+    )
+
+    find_library(LevelZero_LIBRARY
+        NAMES ze_loader
+    )
+endif()
+
+# Compares the two version string that are supposed to be in x.y.z format
+# and reports if the argument VERSION_STR1 is greater than or equal than
+# version_str2. The strings are compared lexicographically after conversion to
+# lists of equal lengths, with the shorter string getting zero-padded.
+function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
+    # Convert the strings to list
+    string(REPLACE  "." ";" VL1 ${VERSION_STR1})
+    string(REPLACE  "." ";" VL2 ${VERSION_STR2})
+    # get lengths of both lists
+    list(LENGTH VL1 VL1_LEN)
+    list(LENGTH VL2 VL2_LEN)
+    set(LEN ${VL1_LEN})
+    # If they differ in size pad the shorter list with 0s
+    if(VL1_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+    elseif(VL2_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+        set(LEN ${VL2_LEN})
+    endif()
+    math(EXPR LEN_SUB_ONE "${LEN}-1")
+    foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
+        list(GET VL1 ${IDX} VAL1)
+        list(GET VL2 ${IDX} VAL2)
+
+        if(${VAL1} GREATER ${VAL2})
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+            break()
+        elseif(${VAL1} LESS ${VAL2})
+            set(${OUTPUT} FALSE PARENT_SCOPE)
+            break()
+        else()
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+        endif()
+    endforeach()
+
+    endfunction(compare_versions)
+
+# Creates a small function to run and extract the LevelZero loader version.
+function(get_l0_loader_version)
+
+    set(L0_VERSIONEER_SRC
+        [====[
+        #include <iostream>
+        #include <level_zero/loader/ze_loader.h>
+        #include <string>
+        int main() {
+            ze_result_t result;
+            std::string loader("loader");
+            zel_component_version_t *versions;
+            size_t size = 0;
+            result = zeInit(0);
+            if (result != ZE_RESULT_SUCCESS) {
+                std::cerr << "Failed to init ze driver" << std::endl;
+                return -1;
+            }
+            zelLoaderGetVersions(&size, nullptr);
+            versions = new zel_component_version_t[size];
+            zelLoaderGetVersions(&size, versions);
+            for (size_t i = 0; i < size; i++) {
+                if (loader.compare(versions[i].component_name) == 0) {
+                    std::cout << versions[i].component_lib_version.major << "."
+                              << versions[i].component_lib_version.minor << "."
+                              << versions[i].component_lib_version.patch;
+                    break;
+                }
+            }
+            delete[] versions;
+            return 0;
+        }
+        ]====]
+    )
+
+    set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp)
+
+    file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}")
+
+    # We need both the directories in the include path as ze_loader.h
+    # includes "ze_api.h" and not "level_zero/ze_api.h".
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+    list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
+    try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
+            "${CMAKE_BINARY_DIR}"
+            "${L0_VERSIONEER_FILE}"
+            LINK_LIBRARIES ${LevelZero_LIBRARY}
+            CMAKE_FLAGS
+                "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+            RUN_OUTPUT_VARIABLE L0_VERSION
+    )
+    if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
+        set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+        message(STATUS "Found Level Zero of version: ${L0_VERSION}")
+    else()
+        message(FATAL_ERROR
+            "Could not compile a level-zero program to extract loader version"
+        )
+    endif()
+endfunction(get_l0_loader_version)
+
+if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
+    list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
+    list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    if(OpenCL_FOUND)
+      list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+    endif()
+
+    cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
+    set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
+
+    if(NOT TARGET LevelZero::LevelZero)
+      add_library(LevelZero::LevelZero INTERFACE IMPORTED)
+      set_target_properties(LevelZero::LevelZero
+        PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
+      )
+      set_target_properties(LevelZero::LevelZero
+        PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
+      )
+    endif()
+endif()
+
+# Check if a specific version of Level Zero is required
+if(LevelZero_FIND_VERSION)
+    get_l0_loader_version()
+    set(VERSION_GT_FIND_VERSION FALSE)
+    compare_versions(
+        ${LevelZero_VERSION}
+        ${LevelZero_FIND_VERSION}
+        VERSION_GT_FIND_VERSION
+    )
+    if(${VERSION_GT_FIND_VERSION})
+        set(LevelZero_FOUND TRUE)
+    else()
+        set(LevelZero_FOUND FALSE)
+    endif()
+else()
+    set(LevelZero_FOUND TRUE)
+endif()
+
+find_package_handle_standard_args(LevelZero
+    REQUIRED_VARS
+        LevelZero_FOUND
+        LevelZero_INCLUDE_DIRS
+        LevelZero_LIBRARY
+        LevelZero_LIBRARIES_DIR
+    HANDLE_COMPONENTS
+)
+mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+
+if(LevelZero_FOUND)
+    find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
+        "(found version ${LevelZero_VERSION})"
+    )
+else()
+    find_package_message(LevelZero "Could not find LevelZero" "")
+endif()
diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake
new file mode 100644
index 000000000000000..38b065a3f284c2c
--- /dev/null
+++ b/mlir/cmake/modules/FindSyclRuntime.cmake
@@ -0,0 +1,68 @@
+# CMake find_package() module for SYCL Runtime
+#
+# Example usage:
+#
+# find_package(SyclRuntime)
+#
+# If successful, the following variables will be defined:
+# SyclRuntime_FOUND
+# SyclRuntime_INCLUDE_DIRS
+# SyclRuntime_LIBRARY
+# SyclRuntime_LIBRARIES_DIR
+#
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT DEFINED ENV{CMPLR_ROOT})
+    message(WARNING "Please make sure to install Intel DPC++ Compiler and run setvars.(sh/bat)")
+    message(WARNING "You can download standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers")
+else()
+    if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux"))
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux")
+    elseif(WIN32)
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows")
+    endif()
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include")
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl")
+
+    set(SyclRuntime_LIBRARY_DIR "${SyclRuntime_ROOT}/lib")
+
+    message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}")
+    find_library(SyclRuntime_LIBRARY
+        NAMES sycl
+        PATHS ${SyclRuntime_LIBRARY_DIR}
+        NO_DEFAULT_PATH
+        )
+endif()
+
+if(SyclRuntime_LIBRARY)
+    set(SyclRuntime_FOUND TRUE)
+    if(NOT TARGET SyclRuntime::SyclRuntime)
+        add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED)
+        set_target_properties(SyclRuntime::SyclRuntime
+            PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}"
+      )
+      set_target_properties(SyclRuntime::SyclRuntime
+          PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}"
+      )
+    endif()
+else()
+    set(SyclRuntime_FOUND FALSE)
+endif()
+
+find_package_handle_standard_args(SyclRuntime
+    REQUIRED_VARS
+        SyclRuntime_FOUND
+        SyclRuntime_INCLUDE_DIRS
+        SyclRuntime_LIBRARY
+        SyclRuntime_LIBRARY_DIR
+    HANDLE_COMPONENTS
+)
+
+mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS)
+
+if(SyclRuntime_FOUND)
+    find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "")
+else()
+    find_package_message(SyclRuntime "Could not find SyclRuntime" "")
+endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index ea33c2c6ed261e1..47b1e82d60ff03f 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -328,4 +328,39 @@ if(LLVM_ENABLE_PIC)
       hip::host hip::amdhip64
     )
   endif()
+
+  if(MLIR_ENABLE_SYCL_RUNNER)
+    find_package(SyclRuntime)
+
+    if(NOT SyclRuntime_FOUND)
+      message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
+    endif()
+
+    find_package(LevelZero)
+
+    if(NOT LevelZero_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+
+    add_mlir_library(sycl-runtime
+      SHARED
+      SyclRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+    if(NOT CXX_HAS_FRTTI_FLAG)
+      message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+    endif()
+    target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(sycl-runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+
+    set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+  endif()
 endif()
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
new file mode 100644
index 000000000000000..ed5ed2170f411c3
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -0,0 +1,386 @@
+//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cfloat>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include <level_zero/ze_api.h>
+#include <map>
+#include <mutex>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+
+#ifdef _WIN32
+#define SYCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define SYCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+template <typename F> auto catchAll(F &&func) {
+  try {
+    return func();
+  } catch (const std::exception &e) {
+    fprintf(stdout, "An exception was thrown: %s\n", e.what());
+    fflush(stdout);
+    abort();
+  } catch (...) {
+    fprintf(stdout, "An unknown exception was thrown\n");
+    fflush(stdout);
+    abort();
+  }
+}
+
+#define L0_SAFE_CALL(call)                                                     \
+  {                                                                            \
+    ze_result_t status = (call);                                               \
+    if (status != ZE_RESULT_SUCCESS) {                                         \
+      fprintf(stdout, "L0 error %d\n", status);                                \
+      fflush(stdout);                                                          \
+      abort();                                                                 \
+    }                                                                          \
+  }
+
+} // namespace
+
+struct SpirvModule {
+  ze_module_handle_t module = nullptr;
+  ~SpirvModule();
+};
+
+namespace {
+// Create a Map for the spirv module lookup
+std::map<void *, SpirvModule> moduleCache;
+std::mutex mutexLock;
+} // namespace
+
+SpirvModule::~SpirvModule() {
+  L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
+}
+
+struct ParamDesc {
+  void *data;
+  size_t size;
+
+  bool operator==(const ParamDesc &rhs) const {
+    return data == rhs.data && size == rhs.size;
+  }
+
+  bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
+};
+
+template <typename T> size_t countUntil(T *ptr, T &&elem) {
+  assert(ptr);
+  auto curr = ptr;
+  while (*curr != elem) {
+    ++curr;
+  }
+  return static_cast<size_t>(curr - ptr);
+}
+
+static sycl::device getDefaultDevice() {
+  auto platformList = sycl::platform::get_platforms();
+  for (const auto &platform : platformList) {
+    auto platformName = platform.get_info<sycl::info::platform::name>();
+    bool isLevelZero = platformName.find("Level-Zero") != std::string::npos;
+    if (!isLevelZero)
+      continue;
+
+    return platform.get_devices()[0];
+  }
+}
+
+struct GPUSYCLQUEUE {
+
+  sycl::device syclDevice_;
+  sycl::context syclContext_;
+  sycl::queue syclQueue_;
+
+  GPUSYCLQUEUE(sycl::property_list propList) {
+
+    syclDevice_ = getDefaultDevice();
+    syclContext_ = sycl::context(syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+  }
+
+  GPUSYCLQUEUE(sycl::device *device, sycl::context *context,
+               sycl::property_list propList) {
+    syclDevice_ = *device;
+    syclContext_ = *context;
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+  }
+  GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) {
+
+    syclDevice_ = *device;
+    syclContext_ = sycl::context(syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+  }
+
+  GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) {
+
+    syclDevice_ = getDefaultDevice();
+    syclContext_ = *context;
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
+  }
+
+}; // end of GPUSYCLQUEUE
+
+static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size,
+                               size_t alignment, bool isShared) {
+  void *memPtr = nullptr;
+  if (isShared) {
+    memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_);
+  } else {
+    memPtr = sycl::aligned_alloc_device(alignment, size, queue->syclQueue_);
+  }
+  if (memPtr == nullptr) {
+    throw std::runtime_error(
+        "aligned_alloc_shared() failed to allocate memory!");
+  }
+  return memPtr;
+}
+
+static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) {
+  sycl::free(ptr, queue->syclQueue_);
+}
+
+static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
+                                     size_t dataSize) {
+  assert(data);
+  auto syclQueue = queue->syclQueue_;
+  ze_module_handle_t zeModule;
+
+  auto it = moduleCache.find((void *)data);
+  // Check the map if the module is present/cached.
+  if (it != moduleCache.end()) {
+    return it->second.module;
+  }
+
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+      syclQueue.get_device());
+  auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+      syclQueue.get_context());
+  L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
+  std::lock_guard<std::mutex> entryLock(mutexLock);
+  moduleCache[(void *)data].module = zeModule;
+  return zeModule;
+}
+
+static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
+                               const char *name) {
+  assert(zeModule);
+  assert(name);
+  auto syclQueue = queue->syclQueue_;
+  ze_kernel_handle_t zeKernel;
+  sycl::kernel *syclKernel;
+  ze_kernel_desc_t desc = {};
+  desc.pKernelName = name;
+
+  L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
+  sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
+      sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
+                               sycl::bundle_state::executable>(
+          {zeModule}, syclQueue.get_context());
+
+  auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
+      {kernelBundle, zeKernel}, syclQueue.get_context());
+  syclKernel = new sycl::kernel(kernel);
+  return syclKernel;
+}
+
+static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel,
+                                 sycl::nd_range<3> NdRange, ParamDesc *params,
+                                 size_t sharedMemBytes) {
+  auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
+  // The assumption is, if there is a param for the shared local memory,
+  // then that will always be the last argument.
+  if (sharedMemBytes) {
+    paramsCount = paramsCount - 1;
+  }
+  sycl::event event = queue.submit([&](sycl::handler &cgh) {
+    for (size_t i = 0; i < paramsCount; i++) {
+      auto param = params[i];
+      cgh.set_arg(static_cast<uint32_t>(i),
+                  *(static_cast<void **>(param.data)));
+    }
+    if (sharedMemBytes) {
+      // TODO: Handle other data types
+      using share_mem_t =
+          sycl::accessor<float, 1, sycl::access::mode::read_write,
+                         sycl::access::target::local>;
+      share_mem_t local_buffer =
+          share_mem_t(sharedMemBytes / sizeof(float), cgh);
+      cgh.set_arg(paramsCount, local_buffer);
+      cgh.parallel_for(NdRange, *kernel);
+    } else {
+      cgh.parallel_for(NdRange, *kernel);
+    }
+  });
+  return event;
+}
+
+static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
+                         size_t gridX, size_t gridY, size_t gridZ,
+                         size_t blockX, size_t blockY, size_t blockZ,
+                         size_t sharedMemBytes, ParamDesc *params) {
+  auto syclQueue = queue->syclQueue_;
+  auto syclGlobalRange =
+      ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
+  auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
+  sycl::nd_range<3> syclNdRange(
+      sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
+
+  if (getenv("IMEX_ENABLE_PROFILING")) {
+    auto executionTime = 0.0f;
+    auto maxTime = 0.0f;
+    auto minTime = FLT_MAX;
+    auto rounds = 100;
+    auto warmups = 3;
+
+    if (getenv("IMEX_PROFILING_RUNS")) {
+      auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
+      if (runs)
+        rounds = runs;
+    }
+
+    if (getenv("IMEX_PROFILING_WARMUPS")) {
+      auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
+      if (warmups)
+        warmups = runs;
+    }
+
+    // warmups
+    for (int r = 0; r < warmups; r++) {
+      enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+    }
+
+    for (int r = 0; r < rounds; r++) {
+      sycl::event event =
+          enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+
+      auto startTime = event.get_profiling_info<
+          cl::sycl::info::event_profiling::command_start>();
+      auto endTime = event.get_profiling_info<
+          cl::sycl::info::event_profiling::command_end>();
+      auto gap = float(endTime - startTime) / 1000000.0f;
+      executionTime += gap;
+      if (gap > maxTime)
+        maxTime = gap;
+      if (gap < minTime)
+        minTime = gap;
+    }
+
+    fprintf(stdout,
+            "the kernel execution time is (ms):"
+            "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
+            executionTime / rounds, minTime, maxTime, rounds);
+  } else {
+    enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
+  }
+}
+
+// Wrappers
+
+extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
+                                                             void *context) {
+  auto propList = sycl::property_list{};
+  if (getenv("IMEX_ENABLE_PROFILING")) {
+    propList = sycl::property_list{sycl::property::queue::enable_profiling()};
+  }
+  return catchAll([&]() {
+    if (!device && !context) {
+      return new GPUSYCLQUEUE(propList);
+    } else if (device && context) {
+      // TODO: Check if the pointers/address is valid and holds the correct
+      // device and context
+      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
+                              static_cast<sycl::context *>(context), propList);
+    } else if (device && !context) {
+      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
+    } else {
+      return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) {
+  catchAll([&]() { delete queue; });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void *
+gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+  return catchAll([&]() {
+    if (queue) {
+      return allocDeviceMemory(queue, size, alignment, isShared);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) {
+  catchAll([&]() {
+    if (queue && ptr) {
+      deallocDeviceMemory(queue, ptr);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
+gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) {
+  return catchAll([&]() {
+    if (queue) {
+      return loadModule(queue, data, dataSize);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
+gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) {
+  return catchAll([&]() {
+    if (queue) {
+      return getKernel(queue, module, name);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX,
+                size_t gridY, size_t gridZ, size_t blockX, size_t blockY,
+                size_t blockZ, size_t sharedMemBytes, void *params) {
+  return catchAll([&]() {
+    if (queue) {
+      launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+                   sharedMemBytes, static_cast<ParamDesc *>(params));
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) {
+
+  catchAll([&]() {
+    if (queue) {
+      queue->syclQueue_.wait();
+    }
+  });
+}

>From 19bc391f87aba81196c0f4233b2a9ab808ade282 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 30 Aug 2023 20:59:43 +0000
Subject: [PATCH 02/13] Fix config and build issues.

---
 mlir/lib/ExecutionEngine/CMakeLists.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 47b1e82d60ff03f..101d9baafcccc49 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES
   CRunnerUtils.cpp
   CudaRuntimeWrappers.cpp
   SparseTensorRuntime.cpp
+  SyclRuntimeWrappers.cpp
   ExecutionEngine.cpp
   Float16bits.cpp
   RocmRuntimeWrappers.cpp
@@ -342,7 +343,7 @@ if(LLVM_ENABLE_PIC)
       message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
     endif()
 
-    add_mlir_library(sycl-runtime
+    add_mlir_library(mlir_sycl_runtime
       SHARED
       SyclRuntimeWrappers.cpp
 
@@ -353,14 +354,14 @@ if(LLVM_ENABLE_PIC)
     if(NOT CXX_HAS_FRTTI_FLAG)
       message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
     endif()
-    target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti)
+    target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
 
-    target_include_directories(sycl-runtime PRIVATE
+    target_include_directories(mlir_sycl_runtime PRIVATE
       ${MLIR_INCLUDE_DIRS}
     )
 
-    target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
 
-    set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
   endif()
 endif()

>From 330d04db7155bbf416ee422934b47eac3dea70ad Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Aug 2023 15:09:11 -0700
Subject: [PATCH 03/13] Suppress clang compiler error.

---
 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index ed5ed2170f411c3..439d31134aa8b9a 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -58,6 +58,9 @@ template <typename F> auto catchAll(F &&func) {
 
 } // namespace
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
 struct SpirvModule {
   ze_module_handle_t module = nullptr;
   ~SpirvModule();
@@ -73,6 +76,8 @@ SpirvModule::~SpirvModule() {
   L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
 }
 
+#pragma clang diagnostic pop
+
 struct ParamDesc {
   void *data;
   size_t size;

>From 203d23eefe5a32f4c13313579bd1d9b9630e2413 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 09:01:06 -0700
Subject: [PATCH 04/13] Add gpu serialize to spirv pass.

---
 .../mlir/Dialect/GPU/Transforms/Passes.td     |  4 ++
 mlir/lib/Dialect/GPU/CMakeLists.txt           |  1 +
 .../GPU/Transforms/SerializeToSPIRV.cpp       | 70 +++++++++++++++++++
 .../GPU/Transforms/serialize-spirv.mlir       | 53 ++++++++++++++
 4 files changed, 128 insertions(+)
 create mode 100644 mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
 create mode 100644 mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index fc20bd2ed921aea..f285f45448ecc7e 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -187,4 +187,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> {
   ];
 }
 
+def GpuSerializeToSPIRVPass : Pass<"gpu-serialize-to-spirv", "ModuleOp"> {
+  let summary = "Serialize spirv dialect to spirv binary";
+}
+
 #endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6244132c073a4a6..e2c1dc7adf646f7 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/SerializeToBlob.cpp
   Transforms/SerializeToCubin.cpp
   Transforms/SerializeToHsaco.cpp
+  Transforms/SerializeToSPIRV.cpp
   Transforms/ShuffleRewriter.cpp
   Transforms/ROCDLAttachTarget.cpp
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
new file mode 100644
index 000000000000000..f013f531371de86
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
@@ -0,0 +1,70 @@
+//===- SerializeToSPIRV.cpp - Convert GPU kernel to SPIRV blob -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass iterates all the SPIR-V modules in the top module and serializes
+/// each SPIR-V module to SPIR-V binary and then attachs the binary blob as a
+/// string attribute to the corresponding gpu module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Target/SPIRV/Serialization.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_GPUSERIALIZETOSPIRVPASS
+#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+struct GpuSerializeToSPIRVPass : public mlir::impl::GpuSerializeToSPIRVPassBase<GpuSerializeToSPIRVPass> {
+public:
+  void runOnOperation() override {
+    auto mod = getOperation();
+    llvm::SmallVector<uint32_t, 0> spvBinary;
+    for (mlir::gpu::GPUModuleOp gpuMod : mod.getOps<gpu::GPUModuleOp>()) {
+      auto name = gpuMod.getName();
+      // check that the spv module has the same name with gpu module except the
+      // prefix "__spv__"
+      auto isSameMod = [&](spirv::ModuleOp spvMod) -> bool {
+        auto spvModName = spvMod.getName();
+        return spvModName->consume_front("__spv__") && spvModName == name;
+      };
+      auto spvMods = mod.getOps<spirv::ModuleOp>();
+      auto it = llvm::find_if(spvMods, isSameMod);
+      if (it == spvMods.end()) {
+        gpuMod.emitError() << "Unable to find corresponding SPIR-V module";
+        signalPassFailure();
+        return;
+      }
+      auto spvMod = *it;
+
+      spvBinary.clear();
+      // serialize the spv module to spv binary
+      if (mlir::failed(spirv::serialize(spvMod, spvBinary))) {
+        spvMod.emitError() << "Failed to serialize SPIR-V module";
+        signalPassFailure();
+        return;
+      }
+
+      // attach the spv binary to the gpu module
+      auto spvData =
+          llvm::StringRef(reinterpret_cast<const char *>(spvBinary.data()),
+                          spvBinary.size() * sizeof(uint32_t));
+      auto spvAttr = mlir::StringAttr::get(&getContext(), spvData);
+      gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr);
+      spvMod->erase();
+    }
+  }
+};
diff --git a/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
new file mode 100644
index 000000000000000..d70e18f3401d38d
--- /dev/null
+++ b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt -gpu-serialize-to-spirv %s | FileCheck %s
+module attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, #spirv.resource_limits<>>} {
+  // CHECK:        gpu.module @addt_kernel attributes {gpu.binary =
+  spirv.module @__spv__addt_kernel Physical64 OpenCL requires #spirv.vce<v1.0, [Int64, Addresses, Kernel], []> {
+    spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input>
+    spirv.func @addt_kernel(%arg0: !spirv.ptr<f32, CrossWorkgroup>, %arg1: !spirv.ptr<f32, CrossWorkgroup>, %arg2: !spirv.ptr<f32, CrossWorkgroup>) "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64} {
+      %cst5_i64 = spirv.Constant 5 : i64
+      %__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+      %0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64>
+      %1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64>
+      %__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+      %2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64>
+      %3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64>
+      spirv.Branch ^bb1
+    ^bb1:  // pred: ^bb0
+      %4 = spirv.IMul %1, %cst5_i64 : i64
+      %5 = spirv.IAdd %4, %3 : i64
+      %6 = spirv.InBoundsPtrAccessChain %arg0[%5] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      %7 = spirv.Load "CrossWorkgroup" %6 ["Aligned", 4] : f32
+      %8 = spirv.IMul %1, %cst5_i64 : i64
+      %9 = spirv.IAdd %8, %3 : i64
+      %10 = spirv.InBoundsPtrAccessChain %arg1[%9] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      %11 = spirv.Load "CrossWorkgroup" %10 ["Aligned", 4] : f32
+      %12 = spirv.FAdd %7, %11 : f32
+      %13 = spirv.IMul %1, %cst5_i64 : i64
+      %14 = spirv.IAdd %13, %3 : i64
+      %15 = spirv.InBoundsPtrAccessChain %arg2[%14] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      spirv.Store "CrossWorkgroup" %15, %12 ["Aligned", 4] : f32
+      spirv.Return
+    }
+    spirv.EntryPoint "Kernel" @addt_kernel, @__builtin_var_WorkgroupId__
+  }
+  gpu.module @addt_kernel {
+    gpu.func @addt_kernel(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c5 = arith.constant 5 : index
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      %2 = arith.muli %0, %c5 : index
+      %3 = arith.addi %2, %1 : index
+      %4 = memref.load %arg0[%3] : memref<?xf32>
+      %5 = arith.muli %0, %c5 : index
+      %6 = arith.addi %5, %1 : index
+      %7 = memref.load %arg1[%6] : memref<?xf32>
+      %8 = arith.addf %4, %7 : f32
+      %9 = arith.muli %0, %c5 : index
+      %10 = arith.addi %9, %1 : index
+      memref.store %8, %arg2[%10] : memref<?xf32>
+      gpu.return
+    }
+  }
+}

>From a755e8f6ea0d919d62640b3aa41db93f793812dd Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 09:09:22 -0700
Subject: [PATCH 05/13] Add dependency.

---
 mlir/lib/Dialect/GPU/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index e2c1dc7adf646f7..38fa60ba06f59a9 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -97,6 +97,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   MLIRSupport
   MLIRROCDLTarget
   MLIRTransformUtils
+  MLIRSPIRVSerialization
   )
 
 add_subdirectory(TransformOps)

>From 1d5d04661841e50e29dac0620c33478b4a5f572b Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.pate at intel.com>
Date: Wed, 6 Sep 2023 17:17:30 +0000
Subject: [PATCH 06/13] Add Sycl Runtime Wrappers

---
 .../GPUCommon/GPUToLLVMConversion.cpp         |  24 +-
 .../ExecutionEngine/CudaRuntimeWrappers.cpp   |   5 +-
 .../ExecutionEngine/RocmRuntimeWrappers.cpp   |   4 +-
 .../ExecutionEngine/SyclRuntimeWrappers.cpp   | 301 ++++--------------
 ...ower-launch-func-to-gpu-runtime-calls.mlir |   7 +-
 .../Integration/GPU/SYCL/gpu-to-spirv.mlir    |  50 +++
 6 files changed, 144 insertions(+), 247 deletions(-)
 create mode 100644 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0bc8..111cfbf93f26a9b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -101,7 +101,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
   FunctionCallBuilder moduleLoadCallBuilder = {
       "mgpuModuleLoad",
       llvmPointerType /* void *module */,
-      {llvmPointerType /* void *cubin */}};
+      {llvmPointerType, /* void *cubin */
+       llvmInt64Type /* size_t size */}};
   FunctionCallBuilder moduleUnloadCallBuilder = {
       "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
   FunctionCallBuilder moduleGetFunctionCallBuilder = {
@@ -125,7 +126,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
           llvmInt32Type,          /* unsigned int sharedMemBytes */
           llvmPointerType,        /* void *hstream */
           llvmPointerPointerType, /* void **kernelParams */
-          llvmPointerPointerType  /* void **extra */
+          llvmPointerPointerType, /* void **extra */
+          llvmInt64Type           /* size_t paramsCount */
       }};
   FunctionCallBuilder streamCreateCallBuilder = {
       "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
@@ -1134,7 +1136,21 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
       LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
 
-  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
+  // SPIRV requires binary size
+  auto gpuBlob = binaryAttr.getValue();
+  auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(llvmInt64Type,
+                             static_cast<int64_t>(gpuBlob.size())));
+
+  auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(
+          llvmInt64Type,
+          static_cast<int64_t>(launchOp.getNumKernelOperands())));
+
+  auto module =
+      moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
   // Get the function from the module. The name corresponds to the name of
   // the kernel function.
   auto kernelName = generateKernelNameConstant(
@@ -1158,7 +1174,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
        adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
        adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
-       /*extra=*/nullpointer});
+       /*extra=*/nullpointer, paramsCount});
 
   if (launchOp.getAsyncToken()) {
     // Async launch: make dependent ops use the same stream.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe66365..8a53d99c778a63a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false;
 #endif // MLIR_ENABLE_CUDA_CUSPARSELT
 #endif // MLIR_ENABLE_CUDA_CUSPARSE
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
+mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
@@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                  intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                  intptr_t blockZ, int32_t smem, CUstream stream, void **params,
-                 void **extra) {
+                 void **extra, size_t /*paramsCount*/) {
   ScopedContext scopedContext;
   int32_t maxShmem = 0;
   CUdevice device = getDefaultCuDevice();
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..998ff5b8b829f88 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -32,7 +32,7 @@
 
 thread_local static int32_t defaultDevice = 0;
 
-extern "C" hipModule_t mgpuModuleLoad(void *data) {
+extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   hipModule_t module = nullptr;
   HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));
   return module;
@@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
                                  intptr_t blockX, intptr_t blockY,
                                  intptr_t blockZ, int32_t smem,
                                  hipStream_t stream, void **params,
-                                 void **extra) {
+                                 void **extra, size_t /*paramsCount*/) {
   HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index 439d31134aa8b9a..60ac27bd84e72fd 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -1,10 +1,14 @@
-//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===//
+//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// Implements C wrappers around the sycl runtime library.
+//
+//===----------------------------------------------------------------------===//
 
 #include <algorithm>
 #include <array>
@@ -32,7 +36,8 @@
 
 namespace {
 
-template <typename F> auto catchAll(F &&func) {
+template <typename F>
+auto catchAll(F &&func) {
   try {
     return func();
   } catch (const std::exception &e) {
@@ -58,46 +63,6 @@ template <typename F> auto catchAll(F &&func) {
 
 } // namespace
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wglobal-constructors"
-
-struct SpirvModule {
-  ze_module_handle_t module = nullptr;
-  ~SpirvModule();
-};
-
-namespace {
-// Create a Map for the spirv module lookup
-std::map<void *, SpirvModule> moduleCache;
-std::mutex mutexLock;
-} // namespace
-
-SpirvModule::~SpirvModule() {
-  L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
-}
-
-#pragma clang diagnostic pop
-
-struct ParamDesc {
-  void *data;
-  size_t size;
-
-  bool operator==(const ParamDesc &rhs) const {
-    return data == rhs.data && size == rhs.size;
-  }
-
-  bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
-};
-
-template <typename T> size_t countUntil(T *ptr, T &&elem) {
-  assert(ptr);
-  auto curr = ptr;
-  while (*curr != elem) {
-    ++curr;
-  }
-  return static_cast<size_t>(curr - ptr);
-}
-
 static sycl::device getDefaultDevice() {
   auto platformList = sycl::platform::get_platforms();
   for (const auto &platform : platformList) {
@@ -108,74 +73,39 @@ static sycl::device getDefaultDevice() {
 
     return platform.get_devices()[0];
   }
+  throw std::runtime_error("getDefaultDevice failed");
 }
 
-struct GPUSYCLQUEUE {
+// Create global device and context
+sycl::device syclDevice = getDefaultDevice();
+sycl::context syclContext = sycl::context(syclDevice);
 
-  sycl::device syclDevice_;
-  sycl::context syclContext_;
+struct QUEUE {
   sycl::queue syclQueue_;
 
-  GPUSYCLQUEUE(sycl::property_list propList) {
-
-    syclDevice_ = getDefaultDevice();
-    syclContext_ = sycl::context(syclDevice_);
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
-  }
-
-  GPUSYCLQUEUE(sycl::device *device, sycl::context *context,
-               sycl::property_list propList) {
-    syclDevice_ = *device;
-    syclContext_ = *context;
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
-  }
-  GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) {
-
-    syclDevice_ = *device;
-    syclContext_ = sycl::context(syclDevice_);
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
-  }
-
-  GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) {
-
-    syclDevice_ = getDefaultDevice();
-    syclContext_ = *context;
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
-  }
-
-}; // end of GPUSYCLQUEUE
+  QUEUE() { syclQueue_ = sycl::queue(syclContext, syclDevice); }
+};
 
-static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size,
-                               size_t alignment, bool isShared) {
+static void *allocDeviceMemory(QUEUE *queue, size_t size, bool isShared) {
   void *memPtr = nullptr;
   if (isShared) {
-    memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_);
+    memPtr = sycl::aligned_alloc_shared(64, size, syclDevice, syclContext);
   } else {
-    memPtr = sycl::aligned_alloc_device(alignment, size, queue->syclQueue_);
+    memPtr = sycl::aligned_alloc_device(64, size, syclDevice, syclContext);
   }
   if (memPtr == nullptr) {
-    throw std::runtime_error(
-        "aligned_alloc_shared() failed to allocate memory!");
+    throw std::runtime_error("mem allocation failed!");
   }
   return memPtr;
 }
 
-static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) {
+static void deallocDeviceMemory(QUEUE *queue, void *ptr) {
   sycl::free(ptr, queue->syclQueue_);
 }
 
-static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
-                                     size_t dataSize) {
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
   assert(data);
-  auto syclQueue = queue->syclQueue_;
   ze_module_handle_t zeModule;
-
-  auto it = moduleCache.find((void *)data);
-  // Check the map if the module is present/cached.
-  if (it != moduleCache.end()) {
-    return it->second.module;
-  }
-
   ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
                            nullptr,
                            ZE_MODULE_FORMAT_IL_SPIRV,
@@ -183,21 +113,17 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
                            (const uint8_t *)data,
                            nullptr,
                            nullptr};
-  auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
-      syclQueue.get_device());
-  auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
-      syclQueue.get_context());
+  auto zeDevice =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclDevice);
+  auto zeContext =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclContext);
   L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
-  std::lock_guard<std::mutex> entryLock(mutexLock);
-  moduleCache[(void *)data].module = zeModule;
   return zeModule;
 }
 
-static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
-                               const char *name) {
+static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) {
   assert(zeModule);
   assert(name);
-  auto syclQueue = queue->syclQueue_;
   ze_kernel_handle_t zeKernel;
   sycl::kernel *syclKernel;
   ze_kernel_desc_t desc = {};
@@ -206,186 +132,87 @@ static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
   L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
   sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
       sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
-                               sycl::bundle_state::executable>(
-          {zeModule}, syclQueue.get_context());
+                               sycl::bundle_state::executable>({zeModule},
+                                                               syclContext);
 
   auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
-      {kernelBundle, zeKernel}, syclQueue.get_context());
+      {kernelBundle, zeKernel}, syclContext);
   syclKernel = new sycl::kernel(kernel);
   return syclKernel;
 }
 
-static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel,
-                                 sycl::nd_range<3> NdRange, ParamDesc *params,
-                                 size_t sharedMemBytes) {
-  auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
-  // The assumption is, if there is a param for the shared local memory,
-  // then that will always be the last argument.
-  if (sharedMemBytes) {
-    paramsCount = paramsCount - 1;
-  }
-  sycl::event event = queue.submit([&](sycl::handler &cgh) {
-    for (size_t i = 0; i < paramsCount; i++) {
-      auto param = params[i];
-      cgh.set_arg(static_cast<uint32_t>(i),
-                  *(static_cast<void **>(param.data)));
-    }
-    if (sharedMemBytes) {
-      // TODO: Handle other data types
-      using share_mem_t =
-          sycl::accessor<float, 1, sycl::access::mode::read_write,
-                         sycl::access::target::local>;
-      share_mem_t local_buffer =
-          share_mem_t(sharedMemBytes / sizeof(float), cgh);
-      cgh.set_arg(paramsCount, local_buffer);
-      cgh.parallel_for(NdRange, *kernel);
-    } else {
-      cgh.parallel_for(NdRange, *kernel);
-    }
-  });
-  return event;
-}
-
-static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
-                         size_t gridX, size_t gridY, size_t gridZ,
-                         size_t blockX, size_t blockY, size_t blockZ,
-                         size_t sharedMemBytes, ParamDesc *params) {
-  auto syclQueue = queue->syclQueue_;
+static void launchKernel(QUEUE *queue, sycl::kernel *kernel, size_t gridX,
+                         size_t gridY, size_t gridZ, size_t blockX,
+                         size_t blockY, size_t blockZ, size_t sharedMemBytes,
+                         void **params, size_t paramsCount) {
   auto syclGlobalRange =
       ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
   auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
   sycl::nd_range<3> syclNdRange(
       sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
 
-  if (getenv("IMEX_ENABLE_PROFILING")) {
-    auto executionTime = 0.0f;
-    auto maxTime = 0.0f;
-    auto minTime = FLT_MAX;
-    auto rounds = 100;
-    auto warmups = 3;
-
-    if (getenv("IMEX_PROFILING_RUNS")) {
-      auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
-      if (runs)
-        rounds = runs;
-    }
-
-    if (getenv("IMEX_PROFILING_WARMUPS")) {
-      auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
-      if (warmups)
-        warmups = runs;
-    }
-
-    // warmups
-    for (int r = 0; r < warmups; r++) {
-      enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
-    }
-
-    for (int r = 0; r < rounds; r++) {
-      sycl::event event =
-          enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
-
-      auto startTime = event.get_profiling_info<
-          cl::sycl::info::event_profiling::command_start>();
-      auto endTime = event.get_profiling_info<
-          cl::sycl::info::event_profiling::command_end>();
-      auto gap = float(endTime - startTime) / 1000000.0f;
-      executionTime += gap;
-      if (gap > maxTime)
-        maxTime = gap;
-      if (gap < minTime)
-        minTime = gap;
+  queue->syclQueue_.submit([&](sycl::handler &cgh) {
+    for (size_t i = 0; i < paramsCount; i++) {
+      cgh.set_arg(static_cast<uint32_t>(i), *(static_cast<void **>(params[i])));
     }
-
-    fprintf(stdout,
-            "the kernel execution time is (ms):"
-            "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
-            executionTime / rounds, minTime, maxTime, rounds);
-  } else {
-    enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
-  }
+    cgh.parallel_for(syclNdRange, *kernel);
+  });
 }
 
 // Wrappers
 
-extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
-                                                             void *context) {
-  auto propList = sycl::property_list{};
-  if (getenv("IMEX_ENABLE_PROFILING")) {
-    propList = sycl::property_list{sycl::property::queue::enable_profiling()};
-  }
-  return catchAll([&]() {
-    if (!device && !context) {
-      return new GPUSYCLQUEUE(propList);
-    } else if (device && context) {
-      // TODO: Check if the pointers/address is valid and holds the correct
-      // device and context
-      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
-                              static_cast<sycl::context *>(context), propList);
-    } else if (device && !context) {
-      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
-    } else {
-      return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
-    }
-  });
+extern "C" SYCL_RUNTIME_EXPORT QUEUE *mgpuStreamCreate() {
+
+  return catchAll([&]() { return new QUEUE(); });
 }
 
-extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(QUEUE *queue) {
   catchAll([&]() { delete queue; });
 }
 
-extern "C" SYCL_RUNTIME_EXPORT void *
-gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+extern "C" SYCL_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t size, QUEUE *queue,
+                                                  bool isShared) {
   return catchAll([&]() {
-    if (queue) {
-      return allocDeviceMemory(queue, size, alignment, isShared);
-    }
+    return allocDeviceMemory(queue, static_cast<size_t>(size), true);
   });
 }
 
-extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, QUEUE *queue) {
   catchAll([&]() {
-    if (queue && ptr) {
+    if (ptr) {
       deallocDeviceMemory(queue, ptr);
     }
   });
 }
 
 extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
-gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) {
-  return catchAll([&]() {
-    if (queue) {
-      return loadModule(queue, data, dataSize);
-    }
-  });
+mgpuModuleLoad(const void *data, size_t gpuBlobSize) {
+  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
 }
 
 extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
-gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) {
-  return catchAll([&]() {
-    if (queue) {
-      return getKernel(queue, module, name);
-    }
-  });
+mgpuModuleGetFunction(ze_module_handle_t module, const char *name) {
+  return catchAll([&]() { return getKernel(module, name); });
 }
 
 extern "C" SYCL_RUNTIME_EXPORT void
-gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX,
-                size_t gridY, size_t gridZ, size_t blockX, size_t blockY,
-                size_t blockZ, size_t sharedMemBytes, void *params) {
+mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ,
+                 size_t blockX, size_t blockY, size_t blockZ,
+                 size_t sharedMemBytes, QUEUE *queue, void **params,
+                 void **extra, size_t paramsCount) {
   return catchAll([&]() {
-    if (queue) {
-      launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
-                   sharedMemBytes, static_cast<ParamDesc *>(params));
-    }
+    launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+                 sharedMemBytes, params, paramsCount);
   });
 }
 
-extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) {
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(QUEUE *queue) {
 
-  catchAll([&]() {
-    if (queue) {
-      queue->syclQueue_.wait();
-    }
-  });
+  catchAll([&]() { queue->syclQueue_.wait(); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuModuleUnload(ze_module_handle_t module) {
+
+  catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); });
 }
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 2cdc4e8dbb1ad67..96e8a6dbd35b171 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -34,8 +34,10 @@ module attributes {gpu.container_module} {
   // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
   // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
   // CHECK-SAME: -> !llvm.ptr
+  // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant
+  // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant
 
-  // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]])
+  // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]])
   // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
 
   // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
@@ -53,10 +55,11 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
 
   // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr
+  
 
   // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
   // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
-  // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]])
+  // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]])
   // CHECK: llvm.call @mgpuStreamSynchronize
   // CHECK: llvm.call @mgpuStreamDestroy
   // CHECK: llvm.call @mgpuModuleUnload
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
new file mode 100644
index 000000000000000..0be0e31a3d71e94
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -0,0 +1,50 @@
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xi64>)
+  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+  %c3 = arith.constant 3 : index
+  %c1 = arith.constant 1 : index
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64>
+  gpu.wait [%asyncToken]
+  memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
+  %1 = gpu.wait async
+  %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64>
+  gpu.wait [%asyncToken_1]
+  memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+  %2 = gpu.wait async
+  %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64>
+  %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+  gpu.wait [%3]
+  %alloc = memref.alloc() : memref<3x3xi64>
+  memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+  %4 = gpu.wait async
+  %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+  %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+  %7 = gpu.dealloc async [%6] %memref : memref<3x3xi64>
+  gpu.wait [%7]
+  return %alloc : memref<3x3xi64>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+}
+
+ 
\ No newline at end of file

>From 66fe69f30c015c2619d840a4ef91b18219ebf7c7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 19:23:39 +0000
Subject: [PATCH 07/13] Temp save. Builds but Integration test fails.

---
 mlir/test/CMakeLists.txt                         |  4 ++++
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 11 +++++++++--
 mlir/test/lit.cfg.py                             |  3 +++
 mlir/test/lit.site.cfg.py.in                     |  1 +
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..874e7718f4a36d1 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -135,6 +135,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
 endif()
 
+if(MLIR_ENABLE_SYCL_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 0be0e31a3d71e94..6ff9d4bf6ca8f5a 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,3 +1,10 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
 module @add attributes {gpu.container_module} {
   memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
   memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
@@ -34,7 +41,7 @@ module @add attributes {gpu.container_module} {
   gpu.wait [%7]
   return %alloc : memref<3x3xi64>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
     gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id  x
       %1 = gpu.block_id  y
@@ -47,4 +54,4 @@ module @add attributes {gpu.container_module} {
   }
 }
 
- 
\ No newline at end of file
+ 
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index f265ac794c6f6db..5d3a4dc575a7b28 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -119,6 +119,9 @@ def add_runtime(name):
 if config.enable_cuda_runner:
     tools.extend([add_runtime("mlir_cuda_runtime")])
 
+if config.enable_sycl_runner:
+    tools.extend([add_runtime("mlir_sycl_runtime")])
+
 # The following tools are optional
 tools.extend(
     [
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..897c12f3abcac75 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -30,6 +30,7 @@ config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@

>From b0758f0ed491b7264cbdd10fc4e82d280e5298f2 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.pate at intel.com>
Date: Wed, 6 Sep 2023 17:17:30 +0000
Subject: [PATCH 08/13] Update Sycl Runtime Wrappers

---
 .../lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 12 ++++++++++--
 mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp     |  3 ++-
 mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp     |  3 ++-
 .../GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir  |  3 ++-
 mlir/test/Conversion/GPUCommon/typed-pointers.mlir   |  3 ++-
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir     |  8 +++-----
 6 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 111cfbf93f26a9b..2b92c1cd8b00e50 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -169,7 +169,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* size_t isHostShared */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -801,6 +802,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   auto loc = allocOp.getLoc();
 
+  bool isShared = allocOp.getHostShared();
+
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
@@ -813,8 +816,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
   auto stream = adaptor.getAsyncDependencies().front();
+
+  auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+          .getResult();
   if (!getTypeConverter()->useOpaquePointers())
     allocatedPtr =
         rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 8a53d99c778a63a..79dc2eed38f06a9 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -211,7 +211,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+                              bool /*isHostShared*/) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 998ff5b8b829f88..b50fd7eb9d05929 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
   HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+                              bool /*isHostShared*/) {
   void *ptr;
   HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
   return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant 
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 6ff9d4bf6ca8f5a..b6180d322ce9329 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -21,15 +21,15 @@ module @add attributes {gpu.container_module} {
   %c3 = arith.constant 3 : index
   %c1 = arith.constant 1 : index
   %0 = gpu.wait async
-  %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64>
+  %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64>
   gpu.wait [%asyncToken]
   memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
   %1 = gpu.wait async
-  %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64>
+  %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<3x3xi64>
   gpu.wait [%asyncToken_1]
   memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
   %2 = gpu.wait async
-  %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64>
+  %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<3x3xi64>
   %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
   gpu.wait [%3]
   %alloc = memref.alloc() : memref<3x3xi64>
@@ -53,5 +53,3 @@ module @add attributes {gpu.container_module} {
     }
   }
 }
-
- 

>From c37e8d84ea6a9ec84b27ffe30f47b6acfbbda202 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 20:35:53 +0000
Subject: [PATCH 09/13] Add f32 integration test.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir | 58 ++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
new file mode 100644
index 000000000000000..b9fc36547220e77
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -0,0 +1,58 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+} {
+  memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
+  memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
+    %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
+    %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
+    %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
+    call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xf32>)
+  func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
+  %c9 = arith.constant 9 : index
+  %c1 = arith.constant 1 : index
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
+  gpu.wait [%asyncToken]
+  memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
+  %1 = gpu.wait async
+  %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
+  gpu.wait [%asyncToken_1]
+  memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
+  %2 = gpu.wait async
+  %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
+  %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
+  gpu.wait [%3]
+  %alloc = memref.alloc() : memref<9xf32>
+  memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
+  %4 = gpu.wait async
+  %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
+  %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
+  %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
+  gpu.wait [%7]
+  return %alloc : memref<9xf32>
+  }
+  gpu.module @test_kernel {
+    gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      //%0 = gpu.block_id  x
+      //%2 = memref.load %arg0[%0] : memref<9xf32>
+      //%3 = memref.load %arg1[%0] : memref<9xf32>
+      //%4 = arith.addf %2, %3 : f32
+      //memref.store %4, %arg2[%0] : memref<9xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3]
+}

>From 84d584d5ba21a8177eca4818a076daf1d471b38c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:26:30 +0000
Subject: [PATCH 10/13] Add new option to convert-gpu-to-spirv pass to handle
 OpenCL

---
 mlir/include/mlir/Conversion/Passes.td            | 5 ++++-
 mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275bf3..3bb6006a467fe37 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -568,7 +568,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   let options = [
     Option<"use64bitIndex", "use-64bit-index",
            "bool", /*default=*/"false",
-           "Use 64-bit integers to convert index types">
+           "Use 64-bit integers to convert index types">,
+    Option<"useOpenCL", "use-opencl",
+           "bool", /*default=*/"false",
+           "Use OpenCL instead of Vulkan">
   ];
 }
 
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index f37c70a771f5916..a52c99ec9daec16 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -71,7 +71,8 @@ void GPUToSPIRVPass::runOnOperation() {
       std::unique_ptr<ConversionTarget> target =
           spirv::getMemorySpaceToStorageClassTarget(*context);
       spirv::MemorySpaceToStorageClassMap memorySpaceMap =
-          spirv::mapMemorySpaceToVulkanStorageClass;
+          this->useOpenCL ? spirv::mapMemorySpaceToOpenCLStorageClass :
+              spirv::mapMemorySpaceToVulkanStorageClass;
       spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap);
 
       RewritePatternSet patterns(context);

>From 2f8ab53057dd7a1433bc4e867a265ee4a215bb2b Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:27:10 +0000
Subject: [PATCH 11/13] Update pass pipeline for integration tests.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir         | 2 +-
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index b9fc36547220e77..984b1e0ae528723 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index b6180d322ce9329..c52e723af68f32b 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \

>From 41bc1125acb049e655aa9c934b8e3da1f64cc26f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:55:10 +0000
Subject: [PATCH 12/13] Fix.

---
 mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp |  4 ----
 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp      |  5 +++++
 mlir/test/Integration/GPU/SYCL/addf.mlir              | 11 ++++++-----
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir      |  3 +++
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 2b92c1cd8b00e50..808431b82472471 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -789,10 +789,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (adaptor.getHostShared())
-    return rewriter.notifyMatchFailure(
-        allocOp, "host_shared allocation is not supported");
-
   MemRefType memRefType = allocOp.getType();
 
   if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index 60ac27bd84e72fd..6b40d4a6922c9f0 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -76,10 +76,15 @@ static sycl::device getDefaultDevice() {
   throw std::runtime_error("getDefaultDevice failed");
 }
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
 // Create global device and context
 sycl::device syclDevice = getDefaultDevice();
 sycl::context syclContext = sycl::context(syclDevice);
 
+#pragma clang diagnostic pop
+
 struct QUEUE {
   sycl::queue syclQueue_;
 
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index 984b1e0ae528723..d4e00ddedf3877c 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -6,8 +6,7 @@
 // RUN: | FileCheck %s
 
 module @add attributes {
-  gpu.container_module,
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+  gpu.container_module
 } {
   memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
   memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
@@ -16,10 +15,10 @@ module @add attributes {
     %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
     %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
     %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
-    call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
     return
   }
-  func.func private @printMemrefI64(memref<*xf32>)
+  func.func private @printMemrefF32(memref<*xf32>)
   func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
   %c9 = arith.constant 9 : index
   %c1 = arith.constant 1 : index
@@ -44,7 +43,9 @@ module @add attributes {
   gpu.wait [%7]
   return %alloc : memref<9xf32>
   }
-  gpu.module @test_kernel {
+  gpu.module @test_kernel attributes {
+    spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+  }{
     gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       //%0 = gpu.block_id  x
       //%2 = memref.load %arg0[%0] : memref<9xf32>
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index c52e723af68f32b..36d132d0c94d32e 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -52,4 +52,7 @@ module @add attributes {gpu.container_module} {
       gpu.return
     }
   }
+  // CHECK: [2,   4100,   6],
+  // CHECK: [16777224,   10,   4294971404],
+  // CHECK: [16777230,   1103806595088,   1099511627794]
 }

>From aae8d757874239e9362eb60ea0eebfd572a0303f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 6 Sep 2023 21:56:16 +0000
Subject: [PATCH 13/13] Remove unneeded test.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir | 59 ------------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
deleted file mode 100644
index d4e00ddedf3877c..000000000000000
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_sycl_runtime \
-// RUN:   --shared-libs=%mlir_runner_utils \
-// RUN:   --entry-point-result=void \
-// RUN: | FileCheck %s
-
-module @add attributes {
-  gpu.container_module
-} {
-  memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
-  memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
-  func.func @main() {
-    %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
-    %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
-    %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
-    %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
-    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
-    return
-  }
-  func.func private @printMemrefF32(memref<*xf32>)
-  func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
-  %c9 = arith.constant 9 : index
-  %c1 = arith.constant 1 : index
-  %0 = gpu.wait async
-  %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
-  gpu.wait [%asyncToken]
-  memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
-  %1 = gpu.wait async
-  %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
-  gpu.wait [%asyncToken_1]
-  memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
-  %2 = gpu.wait async
-  %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
-  %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
-  gpu.wait [%3]
-  %alloc = memref.alloc() : memref<9xf32>
-  memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
-  %4 = gpu.wait async
-  %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
-  %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
-  %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
-  gpu.wait [%7]
-  return %alloc : memref<9xf32>
-  }
-  gpu.module @test_kernel attributes {
-    spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
-  }{
-    gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-      //%0 = gpu.block_id  x
-      //%2 = memref.load %arg0[%0] : memref<9xf32>
-      //%3 = memref.load %arg1[%0] : memref<9xf32>
-      //%4 = arith.addf %2, %3 : f32
-      //memref.store %4, %arg2[%0] : memref<9xf32>
-      gpu.return
-    }
-  }
-  // CHECK: [3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3,  3.3]
-}