[Mlir-commits] [mlir] [MLIR][GPU][XeVM] Add XeVM target and XeVM dialect integration tests. (PR #148286)

Thu Aug 7 10:10:51 PDT 2025

https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/148286

>From e6bcef6f83d6f079c8dab5ad9b3a355a5fb445c4 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 11 Jul 2025 20:13:51 +0000
Subject: [PATCH 1/9] Add XeVM target and XeVM dialect integration tests.
 Covers remaining parts required for XeVM dialect intgration testing. It has
 two high level components - XeVM target and serialization support - XeVM
 dialect integration tests using SYCL runtime

---
 mlir/CMakeLists.txt                           |   8 +
 mlir/include/mlir/InitAllDialects.h           |   2 +
 mlir/include/mlir/Target/LLVM/XeVM/Target.h   |  31 +++
 mlir/include/mlir/Target/LLVM/XeVM/Utils.h    |  39 +++
 mlir/include/mlir/Target/LLVMIR/Dialect/All.h |   3 +
 .../Dialect/XeVM/XeVMToLLVMIRTranslation.h    |  33 +++
 .../GPU/Transforms/XeVMAttachTarget.cpp       |   1 +
 mlir/lib/Target/LLVM/CMakeLists.txt           |  32 +++
 mlir/lib/Target/LLVM/XeVM/Target.cpp          | 257 ++++++++++++++++++
 mlir/lib/Target/LLVMIR/CMakeLists.txt         |   1 +
 mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt |   1 +
 .../Target/LLVMIR/Dialect/XeVM/CMakeLists.txt |  21 ++
 .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp  | 117 ++++++++
 .../Dialect/XeVM/GPU/lit.local.cfg            |   4 +
 .../Dialect/XeVM/GPU/xevm_block_dpas.mlir     | 135 +++++++++
 .../XeVM/GPU/xevm_block_load_store.mlir       | 103 +++++++
 .../xevm_block_load_store_pack_register.mlir  | 119 ++++++++
 .../GPU/xevm_block_load_store_transpose.mlir  | 127 +++++++++
 .../Dialect/XeVM/GPU/xevm_store_cst.mlir      |  74 +++++
 mlir/test/lib/Dialect/GPU/CMakeLists.txt      |   1 +
 mlir/test/lit.site.cfg.py.in                  |   1 +
 21 files changed, 1110 insertions(+)
 create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Target.h
 create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Utils.h
 create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
 create mode 100644 mlir/lib/Target/LLVM/XeVM/Target.cpp
 create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
 create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a1ad81f625cd6..7c9d62051f9f8 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -137,6 +137,14 @@ else()
   set(MLIR_ENABLE_ROCM_CONVERSIONS 0)
 endif()
 
+# Build the XeVM conversions and run according tests if the SPIRV backend
+# is available.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 1)
+else()
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 0)
+endif()
+
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index c6fcf1a0d510b..79dcafe69f0a5 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -102,6 +102,7 @@
 #include "mlir/Interfaces/CastInterfaces.h"
 #include "mlir/Target/LLVM/NVVM/Target.h"
 #include "mlir/Target/LLVM/ROCDL/Target.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "mlir/Target/SPIRV/Target.h"
 
 namespace mlir {
@@ -200,6 +201,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
   NVVM::registerNVVMTargetInterfaceExternalModels(registry);
   ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
   spirv::registerSPIRVTargetInterfaceExternalModels(registry);
+  xevm::registerXeVMTargetInterfaceExternalModels(registry);
 }
 
 /// Append all the MLIR dialects to the registry contained in the given context.
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Target.h b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
new file mode 100644
index 0000000000000..31a93d0ebabfc
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
@@ -0,0 +1,31 @@
+//===-- Target.h - MLIR XeVM target registration ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for attaching the XeVM target interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_XEVM_TARGET_H
+#define MLIR_TARGET_XEVM_TARGET_H
+
+namespace mlir {
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir::xevm {
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the given registry.
+void registerXeVMTargetInterfaceExternalModels(mlir::DialectRegistry &registry);
+
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the registry associated with the given context.
+void registerXeVMTargetInterfaceExternalModels(mlir::MLIRContext &context);
+} // namespace mlir::xevm
+
+#endif // MLIR_TARGET_XEVM_TARGET_H
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
new file mode 100644
index 0000000000000..c11a97f0d960a
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -0,0 +1,39 @@
+//===-- Utils.h - MLIR XeVM target utils ------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This files declares XeVM target related utility classes and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_XEVM_UTILS_H
+#define MLIR_TARGET_LLVM_XEVM_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/Target/LLVM/ModuleToObject.h"
+
+namespace mlir {
+namespace xevm {
+
+/// Base class for all XeVM serializations from GPU modules into binary strings.
+/// By default this class serializes into LLVM bitcode.
+class SerializeGPUModuleBase : public mlir::LLVM::ModuleToObject {
+public:
+  SerializeGPUModuleBase(mlir::Operation &module, XeVMTargetAttr target,
+                         const mlir::gpu::TargetOptions &targetOptions = {});
+
+  static void init();
+  XeVMTargetAttr getTarget() const;
+
+protected:
+  XeVMTargetAttr target;
+};
+} // namespace xevm
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_XEVM_UTILS_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 60615cf601655..e4670cb1a9622 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -28,6 +28,7 @@
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
 
 namespace mlir {
 class DialectRegistry;
@@ -47,6 +48,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
   registerVCIXDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
@@ -63,6 +65,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
   registerNVVMDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
new file mode 100644
index 0000000000000..149a2119657d5
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
@@ -0,0 +1,33 @@
+//===-- XeVMToLLVMIRTranslation.h - XeVM to LLVM IR -------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for XeVM dialect to LLVM IR translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir {
+/// Register the XeVM dialect and the translation from it to the LLVM IR in the
+/// given registry;
+void registerXeVMDialectTranslation(mlir::DialectRegistry &registry);
+
+/// Register the XeVM dialect and the translation from it in the registry
+/// associated with the given context.
+void registerXeVMDialectTranslation(mlir::MLIRContext &context);
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
diff --git a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
index e9cf4939a13b8..6da76e9e7a331 100644
--- a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "llvm/Support/Regex.h"
 
 namespace mlir {
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 83fbf7a5fe5f3..ed15c5d2ab2ca 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -209,3 +209,35 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
   )
 endif()
 
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(SPIRV_LIBS
+    SPIRVCodeGen
+
+  )
+endif()
+
+add_mlir_dialect_library(MLIRXeVMTarget
+  XeVM/Target.cpp
+
+  OBJECT
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR
+
+  LINK_COMPONENTS
+  ${SPIRV_LIBS}
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRExecutionEngineUtils
+  MLIRSupport
+  MLIRGPUDialect
+  MLIRTargetLLVM
+  MLIRXeVMToLLVMIRTranslation
+)
+
+# Ensure SPIRV headers are included. Warning: references build directory!
+target_include_directories(MLIRXeVMTarget PRIVATE
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/SPIRV
+  ${LLVM_BINARY_DIR}/lib/Target/SPIRV
+)
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
new file mode 100644
index 0000000000000..380e2bff222ca
--- /dev/null
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -0,0 +1,257 @@
+//===-- Target.cpp - MLIR LLVM XeVM target compilation ----------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines XeVM target related functions including registration
+// calls for the `#xevm.target` compilation attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/XeVM/Target.h"
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/Target/LLVM/XeVM/Utils.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+
+#include "llvm/Config/Targets.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+// FIXME: One of the headers uses `.inc` file from the build directory, this
+// does not work for installation (i.e., DCMAKE_INSTALL_PREFIX) caching as build
+// directory will not be cached. Since float atomics are not yet supported by
+// the backend anyway, we can afford to temporarily comment this section.
+
+// #if LLVM_HAS_SPIRV_TARGET
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+// #include "SPIRVTargetMachine.h"
+// #pragma GCC diagnostic pop
+
+// #include "SPIRVCommandLine.h"
+// #endif // LLVM_HAS_SPIRV_TARGET
+
+#include <set>
+
+using namespace mlir;
+
+namespace {
+// XeVM implementation of the gpu:TargetAttrInterface.
+class XeVMTargetAttrImpl
+    : public gpu::TargetAttrInterface::FallbackModel<XeVMTargetAttrImpl> {
+public:
+  std::optional<SmallVector<char, 0>>
+  serializeToObject(Attribute attribute, Operation *module,
+                    const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute, Operation *module,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
+};
+} // namespace
+
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(
+      +[](MLIRContext *ctx, mlir::xevm::XeVMDialect *dialect) {
+        mlir::xevm::XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
+      });
+}
+
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMTargetInterfaceExternalModels(registry);
+  context.appendDialectRegistry(registry);
+}
+
+mlir::xevm::SerializeGPUModuleBase::SerializeGPUModuleBase(
+    Operation &module, mlir::xevm::XeVMTargetAttr target,
+    const gpu::TargetOptions &targetOptions)
+    : ModuleToObject(module, target.getTriple(), "", {}, target.getO()),
+      target(target) {}
+
+void mlir::xevm::SerializeGPUModuleBase::init() {
+  static llvm::once_flag initializeBackendOnce;
+  llvm::call_once(initializeBackendOnce, []() {
+#if LLVM_HAS_SPIRV_TARGET
+    LLVMInitializeSPIRVTarget();
+    LLVMInitializeSPIRVTargetInfo();
+    LLVMInitializeSPIRVTargetMC();
+    LLVMInitializeSPIRVAsmPrinter();
+#endif
+  });
+}
+
+mlir::xevm::XeVMTargetAttr
+mlir::xevm::SerializeGPUModuleBase::getTarget() const {
+  return target;
+}
+
+namespace {
+class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase {
+public:
+  SpirSerializer(Operation &module, mlir::xevm::XeVMTargetAttr target,
+                 const gpu::TargetOptions &targetOptions)
+      : mlir::xevm::SerializeGPUModuleBase(module, target, targetOptions) {}
+
+  gpu::GPUModuleOp getOperation();
+
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+gpu::GPUModuleOp SpirSerializer::getOperation() {
+  return dyn_cast<gpu::GPUModuleOp>(
+      &mlir::xevm::SerializeGPUModuleBase::getOperation());
+}
+
+std::optional<SmallVector<char, 0>>
+SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
+  // Return LLVM IR if the compilation target is `offload`.
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
+    return mlir::xevm::SerializeGPUModuleBase::moduleToObject(llvmModule);
+
+#if !LLVM_HAS_SPIRV_TARGET
+  getOperation()->emitError(
+      "The `SPIRV` target was not built. Please enable it when building LLVM.");
+  return std::nullopt;
+#endif // LLVM_HAS_SPIRV_TARGET
+
+  std::optional<llvm::TargetMachine *> targetMachine =
+      getOrCreateTargetMachine();
+  if (!targetMachine) {
+    getOperation().emitError() << "Target Machine unavailable for triple "
+                               << triple << ", can't compile with LLVM\n";
+    return std::nullopt;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Workaround to enable spirv extensions that are not added to target machine
+  // by default.
+
+  // FIXME: see fixme comment above SPIRV headers.
+  // #if LLVM_HAS_SPIRV_TARGET
+  //   std::set<llvm::SPIRV::Extension::Extension> AllowedExtIds{
+  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add,
+  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add};
+  //   llvm::SPIRVTargetMachine *STM =
+  //       static_cast<llvm::SPIRVTargetMachine *>(targetMachine.value());
+  //   const_cast<llvm::SPIRVSubtarget *>(STM->getSubtargetImpl())
+  //       ->initAvailableExtensions(AllowedExtIds);
+  // #endif // LLVM_HAS_SPIRV_TARGET
+
+  //===----------------------------------------------------------------------===//
+
+  // Return SPIRV if the compilation target is `assembly`.
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    std::optional<std::string> serializedISA =
+        translateToISA(llvmModule, **targetMachine);
+    if (!serializedISA) {
+      getOperation().emitError() << "Failed translating the module to ISA.";
+      return std::nullopt;
+    }
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }
+
+  std::optional<std::string> serializedSPIRVBinary =
+      translateToSPIRVBinary(llvmModule, **targetMachine);
+  if (!serializedSPIRVBinary) {
+    getOperation().emitError() << "Failed translating the module to Binary.";
+    return std::nullopt;
+  }
+  if (serializedSPIRVBinary->size() % 4) {
+    getOperation().emitError() << "SPIRV code size must be a multiple of 4.";
+    return std::nullopt;
+  }
+  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+std::optional<std::string>
+SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  llvm::raw_string_ostream stream(targetISA);
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CodeGenFileType::ObjectFile))
+      return std::nullopt;
+
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
+}
+
+std::optional<SmallVector<char, 0>>
+XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
+                                      const gpu::TargetOptions &options) const {
+  if (!module)
+    return std::nullopt;
+  auto gpuMod = dyn_cast<gpu::GPUModuleOp>(module);
+  if (!gpuMod) {
+    module->emitError("expected to be a gpu.module op");
+    return std::nullopt;
+  }
+  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+      funcOp.setIntelReqdSubGroupSize(16);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  SpirSerializer serializer(
+      *module, cast<mlir::xevm::XeVMTargetAttr>(attribute), options);
+  serializer.init();
+
+#if !LLVM_HAS_SPIRV_TARGET
+  module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
+                    "without having the target built.");
+#endif
+
+  return serializer.run();
+}
+
+Attribute
+XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  DictionaryAttr objectProps;
+  Builder builder(attribute.getContext());
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      objectProps, /*kernels=*/nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index af22a7ff04bf0..9ea5c6835e8ef 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -60,6 +60,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
   MLIRROCDLToLLVMIRTranslation
   MLIRSPIRVToLLVMIRTranslation
   MLIRVCIXToLLVMIRTranslation
+  MLIRXeVMToLLVMIRTranslation
   )
 
 add_mlir_translation_library(MLIRTargetLLVMIRImport
diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
index f030fa78942d5..86c731a1074c3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(OpenMP)
 add_subdirectory(ROCDL)
 add_subdirectory(SPIRV)
 add_subdirectory(VCIX)
+add_subdirectory(XeVM)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
new file mode 100644
index 0000000000000..6308d7e2e4404
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_OPTIONAL_SOURCES
+  XeVMToLLVMIRTranslation.cpp
+)
+
+add_mlir_translation_library(MLIRXeVMToLLVMIRTranslation
+  XeVMToLLVMIRTranslation.cpp
+
+  DEPENDS
+  MLIRXeVMConversionsIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRDialectUtils
+  MLIRIR
+  MLIRLLVMDialect
+  MLIRXeVMDialect
+  MLIRSupport
+  MLIRTargetLLVMIRExport
+)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
new file mode 100644
index 0000000000000..f961d25ff1b86
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
@@ -0,0 +1,117 @@
+//===-- XeVMToLLVMIRTranslation.cpp - Translate XeVM to LLVM IR -*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between the MLIR XeVM dialect and
+// LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+namespace {
+/// Implementation of the dialect interface that converts operations belonging
+/// to the XeVM dialect to LLVM IR.
+class XeVMDialectLLVMIRTranslationInterface
+    : public LLVMTranslationDialectInterface {
+public:
+  using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+  /// Translates the given operation to LLVM IR using the provided IR builder
+  /// and saving the state in `moduleTranslation`.
+  LogicalResult
+  convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation) const final {
+    /* TODO */
+    return failure();
+  }
+
+  /// Attaches module-level metadata for functions marked as kernels.
+  LogicalResult
+  amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
+                 NamedAttribute attribute,
+                 LLVM::ModuleTranslation &moduleTranslation) const final {
+    StringRef attrName = attribute.getName().getValue();
+    if (attrName == mlir::xevm::XeVMDialect::getCacheControlsAttrName()) {
+      auto cacheControlsArray = dyn_cast<ArrayAttr>(attribute.getValue());
+      if (cacheControlsArray.size() != 2) {
+        return op->emitOpError(
+            "Expected both L1 and L3 cache control attributes!");
+      }
+      if (instructions.size() != 1) {
+        return op->emitOpError("Expecting a single instruction");
+      }
+      return handleDecorationCacheControl(instructions.front(),
+                                          cacheControlsArray.getValue());
+    }
+    auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+    if (!func)
+      return failure();
+
+    return success();
+  }
+
+private:
+  template <typename IntTy>
+  static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) {
+    return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val));
+  }
+
+  static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst,
+                                                    ArrayRef<Attribute> attrs) {
+    SmallVector<llvm::Metadata *> decorations;
+    llvm::LLVMContext &ctx = inst->getContext();
+    llvm::Type *i32Ty = llvm::IntegerType::getInt32Ty(ctx);
+    llvm::transform(attrs, std::back_inserter(decorations),
+                    [&ctx, i32Ty](Attribute attr) -> llvm::Metadata * {
+                      auto valuesArray = dyn_cast<ArrayAttr>(attr).getValue();
+                      std::array<llvm::Metadata *, 4> metadata;
+                      llvm::transform(
+                          valuesArray, metadata.begin(),
+                          [i32Ty](Attribute valueAttr) {
+                            return getConstantIntMD(
+                                i32Ty, cast<IntegerAttr>(valueAttr).getValue());
+                          });
+                      return llvm::MDNode::get(ctx, metadata);
+                    });
+    constexpr llvm::StringLiteral decorationCacheControlMDName =
+        "spirv.DecorationCacheControlINTEL";
+    inst->setMetadata(decorationCacheControlMDName,
+                      llvm::MDNode::get(ctx, decorations));
+    return success();
+  }
+};
+} // namespace
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
+  registry.insert<xevm::XeVMDialect>();
+  registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) {
+    dialect->addInterfaces<XeVMDialectLLVMIRTranslationInterface>();
+  });
+}
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMDialectTranslation(registry);
+  context.appendDialectRegistry(registry);
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
new file mode 100644
index 0000000000000..d172445e6ee54
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
@@ -0,0 +1,4 @@
+if not config.run_xevm_tests:
+    config.unsupported = True
+if not config.enable_sycl_runner:
+    config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
new file mode 100644
index 0000000000000..07bd15a35083e
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
@@ -0,0 +1,135 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // - Sets of `matrix_mad` intrinsics can differ based on device's *minimal* supported sub-group size.
+    //   The *minimum supported* sub-group size should be used to call `matrix_mad` intrinsics.
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
+
+    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel {
+      %base_width_a = arith.constant 32 : i32
+      %base_height_a = arith.constant 8 : i32
+      %base_pitch_a = arith.constant 32 : i32
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>
+
+      %base_width_b = arith.constant 32 : i32
+      %base_height_b = arith.constant 16 : i32
+      %base_pitch_b = arith.constant 32 : i32
+      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32>
+
+      %base_width_c = arith.constant 64 : i32
+      %base_height_c = arith.constant 8 : i32
+      %base_pitch_c = arith.constant 64 : i32
+      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+
+      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32>
+      %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted {shape=<m=8, n=16, k=16>, types=<d=f32, a=f16, b=f16, c=f32>} : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
+      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32>
+
+      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+
+    %memref_a = gpu.alloc() : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
+    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
+    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
+    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
+    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_b = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
+    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
+    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
+    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
+    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_c = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>
+    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
+    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
+    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
+    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    %res = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %res : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %row_idx = arith.index_cast %i : index to i32
+        %row = arith.sitofp %row_idx : i32 to f16
+        memref.store %row, %A[%i, %j] : memref<8x16xf16>
+      }
+    }
+    %B = memref.alloc() : memref<16x16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %col_idx = arith.index_cast %j : index to i32
+        %col = arith.sitofp %col_idx : i32 to f16
+        memref.store %col, %B[%i, %j] : memref<16x16xf16>
+      }
+    }
+
+    %C = memref.alloc() : memref<8x16xf32>
+    %c0_f16 = arith.constant 0.0 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c0_f16, %C[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
+    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16>
+    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
+    // CHECK-NEXT: [0,   16,   32,   48,   64,   80,   96,   112,   128,   144,   160,   176,   192,   208,   224,   240]
+    // CHECK-NEXT: [0,   32,   64,   96,   128,   160,   192,   224,   256,   288,   320,   352,   384,   416,   448,   480]
+    // CHECK-NEXT: [0,   48,   96,   144,   192,   240,   288,   336,   384,   432,   480,   528,   576,   624,   672,   720]
+    // CHECK-NEXT: [0,   64,   128,   192,   256,   320,   384,   448,   512,   576,   640,   704,   768,   832,   896,   960]
+    // CHECK-NEXT: [0,   80,   160,   240,   320,   400,   480,   560,   640,   720,   800,   880,   960,   1040,   1120,   1200]
+    // CHECK-NEXT: [0,   96,   192,   288,   384,   480,   576,   672,   768,   864,   960,   1056,   1152,   1248,   1344,   1440]
+    // CHECK-NEXT: [0,   112,   224,   336,   448,   560,   672,   784,   896,   1008,   1120,   1232,   1344,   1456,   1568,   1680]
+
+    memref.dealloc %A : memref<8x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    memref.dealloc %C : memref<8x16xf32>
+    memref.dealloc %C_res : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
new file mode 100644
index 0000000000000..3efb43bd0e426
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
@@ -0,0 +1,103 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    // - `cl_intel_subgroups` block load/store intrinsics operate at the *maximum* sub-group size,
+    //     regardless of the active sub-group size. Make sure `clGetKernelSubGroupInfo` meets your expectations.
+    // - The attribute `intel_reqd_sub_group_size` establishes the maximum sub-group size for a kernel.
+    //
+    // Note: launching 16 threads without explicit `intel_reqd_sub_group_size = 16` may still use
+    //       the default sub-group size of 32.
+    //
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_required_subgroup_size.html
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html
+
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  {
+      %base_width = arith.constant 64 : i32 // bytewidth of the block
+      %base_height = arith.constant 8 : i32 // number of rows
+      %base_pitch = arith.constant 64 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      // If `intel_reqd_sub_group_size = 16` is not set, the default (32) is used and this `blockload2d` would only load 4 elements into vector<8xi32>
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+      %c0 = arith.constant 0 : i32
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+      %loaded_f32_modified = vector.insertelement %thread_x_f32, %loaded_f32[%c0 : i32] : vector<8xf32>
+      %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
+      xevm.blockstore2d %dst, %base_width, %base_height, %base_pitch, %x, %y, %loaded_modified <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_src, %src : memref<8x16xf32>, memref<8x16xf32>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<8x16xf32> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<8x16xf32>
+    %dst = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.11 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    memref.dealloc %A : memref<8x16xf32>
+    memref.dealloc %B : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
new file mode 100644
index 0000000000000..88d2e90794fb0
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  {
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height_load = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+
+      // Consider the following two loads:
+      // Normal load:
+      %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16>
+      %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+
+      // Register packed load:
+      %loaded_packed = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_packed_f16_flat = vector.bitcast %loaded_packed : vector<8xi32> to vector<16xf16>
+      %loaded_packed_f16 = vector.shape_cast %loaded_packed_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+      // Both can be represented the same way in code as vector<16xf16>.
+      // A normal load pads a value to a dword (e.g., 32-bit) when loaded to a register.
+      // Packed load "packs" multiple sub-dword values along the column (↓), allowing a single register to hold multiple values.
+      //  In SIMT, a work-item reads values along the column (↓), hence a sequence of values loaded by packing to register is logically equivalent to the sequence of values loaded using a normal load.
+      // The load results of both methods can have the same logical representation, but are expected to differ in physical layout and register efficiency.
+
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16
+      %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_packed_f16 [0,0,1] : f16 into vector<8x1x2xf16> // Both loaded_packed_f16 and loaded_f16 can be used here
+      // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores
+      %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16>
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 32 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+
+      %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16>
+
+      %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32
+      %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8
+      xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+      gpu.return
+    }
+  }
+
+
+  func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_src, %src : memref<16x16xf16>, memref<16x16xf16>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<16x16xf16>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x16xf16>
+    %dst = memref.alloc() : memref<16x16xf16>
+    gpu.memcpy %dst, %memref_dst : memref<16x16xf16>, memref<16x16xf16>
+    gpu.dealloc %memref_dst : memref<16x16xf16>
+    return %dst : memref<16x16xf16>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 16 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.1 : f16
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<16x16xf16>
+      }
+    }
+    %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16>
+    %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16>
+    %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16>
+    call @printMemrefF16(%A_cast) : (memref<*xf16>) -> ()
+    call @printMemrefF16(%B_cast) : (memref<*xf16>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK-COUNT-224: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    // CHECK-COUNT-208: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    memref.dealloc %A : memref<16x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
new file mode 100644
index 0000000000000..646d1fc9b4666
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
@@ -0,0 +1,127 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  {
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      // Normally a work-item loads a vertical slice (↓), but with *transpose* a work-item loads a horizontal slice (→).
+      // The tile dimension we want to slice must be a multiple of the sub-group size: e.g., we want to slice rows (→), then we need SG_SIZE % tile_height == 0.
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=true, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+
+      %c0 = arith.constant 0 : i32
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32> // Use this to see where threadIds end up stored
+      %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
+
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 64 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row
+      // "Transposed" stores are not available, meaning a work-item can store its vector as a vertical slice (↓).
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+
+  func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x8xf32>
+    gpu.memcpy %memref_src, %src : memref<16x8xf32>, memref<16x8xf32>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x8xf32>
+    %dst = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x8xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.11 : f16
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c8 step %c1 {
+        %c_10_f = arith.constant 10.0 : f32
+        %j_i64 = arith.index_cast %j : index to i64
+        %j_i32 = llvm.trunc %j_i64 : i64 to i32
+        %j_f32 = arith.sitofp %j_i32 : i32 to f32
+        %jj = arith.divf %j_f32, %c_10_f : f32
+
+        %i_i64 = arith.index_cast %i : index to i64
+        %i_i32 = llvm.trunc %i_i64 : i64 to i32
+        %i_f32 = arith.sitofp %i_i32 : i32 to f32
+        %ii = arith.addf %i_f32, %jj : f32
+        memref.store %ii, %A[%i, %j] : memref<16x8xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32>
+    %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   0.1,   0.2,   0.3,   0.4,   0.5,   0.6,   0.7],
+    // CHECK-NEXT: [1,   1.1,   1.2,   1.3,   1.4,   1.5,   1.6,   1.7],
+    // CHECK-NEXT: [2,   2.1,   2.2,   2.3,   2.4,   2.5,   2.6,   2.7],
+    // CHECK-NEXT: [3,   3.1,   3.2,   3.3,   3.4,   3.5,   3.6,   3.7],
+    // CHECK-NEXT: [4,   4.1,   4.2,   4.3,   4.4,   4.5,   4.6,   4.7],
+    // CHECK-NEXT: [5,   5.1,   5.2,   5.3,   5.4,   5.5,   5.6,   5.7],
+    // CHECK-NEXT: [6,   6.1,   6.2,   6.3,   6.4,   6.5,   6.6,   6.7],
+    // CHECK-NEXT: [7,   7.1,   7.2,   7.3,   7.4,   7.5,   7.6,   7.7],
+    // CHECK-NEXT: [8,   8.1,   8.2,   8.3,   8.4,   8.5,   8.6,   8.7],
+    // CHECK-NEXT: [9,   9.1,   9.2,   9.3,   9.4,   9.5,   9.6,   9.7],
+    // CHECK-NEXT: [10,   10.1,   10.2,   10.3,   10.4,   10.5,   10.6,   10.7],
+    // CHECK-NEXT: [11,   11.1,   11.2,   11.3,   11.4,   11.5,   11.6,   11.7],
+    // CHECK-NEXT: [12,   12.1,   12.2,   12.3,   12.4,   12.5,   12.6,   12.7],
+    // CHECK-NEXT: [13,   13.1,   13.2,   13.3,   13.4,   13.5,   13.6,   13.7],
+    // CHECK-NEXT: [14,   14.1,   14.2,   14.3,   14.4,   14.5,   14.6,   14.7],
+    // CHECK-NEXT: [15,   15.1,   15.2,   15.3,   15.4,   15.5,   15.6,   15.7]
+
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,   11,   12,   13,   14,   15],
+    // CHECK-NEXT: [0.1,   1.1,   2.1,   3.1,   4.1,   5.1,   6.1,   7.1,   8.1,   9.1,   10.1,   11.1,   12.1,   13.1,   14.1,   15.1],
+    // CHECK-NEXT: [0.2,   1.2,   2.2,   3.2,   4.2,   5.2,   6.2,   7.2,   8.2,   9.2,   10.2,   11.2,   12.2,   13.2,   14.2,   15.2],
+    // CHECK-NEXT: [0.3,   1.3,   2.3,   3.3,   4.3,   5.3,   6.3,   7.3,   8.3,   9.3,   10.3,   11.3,   12.3,   13.3,   14.3,   15.3],
+    // CHECK-NEXT: [0.4,   1.4,   2.4,   3.4,   4.4,   5.4,   6.4,   7.4,   8.4,   9.4,   10.4,   11.4,   12.4,   13.4,   14.4,   15.4],
+    // CHECK-NEXT: [0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,   9.5,   10.5,   11.5,   12.5,   13.5,   14.5,   15.5],
+    // CHECK-NEXT: [0.6,   1.6,   2.6,   3.6,   4.6,   5.6,   6.6,   7.6,   8.6,   9.6,   10.6,   11.6,   12.6,   13.6,   14.6,   15.6],
+    // CHECK-NEXT: [0.7,   1.7,   2.7,   3.7,   4.7,   5.7,   6.7,   7.7,   8.7,   9.7,   10.7,   11.7,   12.7,   13.7,   14.7,   15.7]
+
+    memref.dealloc %A : memref<16x8xf32>
+    memref.dealloc %B : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
new file mode 100644
index 0000000000000..7ead3577857f5
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
@@ -0,0 +1,74 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    gpu.func @store_constant(%ptr: !llvm.ptr<1>) kernel {
+      %const_val = arith.constant 42.0 : f32
+      %thread_x = gpu.lane_id
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %ptr_next_1 = llvm.getelementptr %ptr[%thread_x_i64] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, i32
+      llvm.store %const_val, %ptr_next_1 : f32, !llvm.ptr<1>
+      gpu.return
+    }
+  }
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %memref_0 = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_0, %src : memref<8x16xf32>, memref<8x16xf32>
+    %0 = memref.extract_aligned_pointer_as_index %memref_0 : memref<8x16xf32> -> index
+    %1 = arith.index_cast %0 : index to i64
+    %2 = llvm.inttoptr %1 : i64 to !llvm.ptr
+    %src_casted = llvm.addrspacecast %2 : !llvm.ptr to !llvm.ptr<1>
+    gpu.launch_func @kernel::@store_constant blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_casted : !llvm.ptr<1>)
+    %dst = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %dst, %memref_0 : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_0 : memref<8x16xf32>
+
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.11 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    memref.dealloc %A : memref<8x16xf32>
+    memref.dealloc %B : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
index 418c884dc03b3..882d5abc2eeb8 100644
--- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
@@ -30,6 +30,7 @@ set(LIBS
   MLIRVectorDialect
   MLIRVectorToLLVMPass
   MLIRXeVMDialect
+  MLIRXeVMToLLVMIRTranslation
   )
 
 add_mlir_library(MLIRGPUTestPasses
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 132aabe135940..08dc98938a31a 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -32,6 +32,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.run_xevm_tests = @MLIR_ENABLE_XEVM_CONVERSIONS@
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@

>From 717bdf9970544a4b2eb87dc861679ed8f6dba74c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 25 Jul 2025 19:51:52 +0000
Subject: [PATCH 2/9] Add XeVM to LLVMIR translation.

---
 mlir/include/mlir/Target/LLVMIR/Dialect/All.h |   3 +
 .../Dialect/XeVM/XeVMToLLVMIRTranslation.h    |  31 +++++
 mlir/lib/Target/LLVMIR/CMakeLists.txt         |   1 +
 mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt |   1 +
 .../Target/LLVMIR/Dialect/XeVM/CMakeLists.txt |  21 ++++
 .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp  | 108 ++++++++++++++++++
 mlir/test/Target/LLVMIR/xevm.mlir             | 101 ++++++++++++++++
 7 files changed, 266 insertions(+)
 create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
 create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
 create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
 create mode 100644 mlir/test/Target/LLVMIR/xevm.mlir

diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 60615cf601655..e4670cb1a9622 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -28,6 +28,7 @@
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
 
 namespace mlir {
 class DialectRegistry;
@@ -47,6 +48,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
   registerVCIXDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
@@ -63,6 +65,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
   registerNVVMDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
new file mode 100644
index 0000000000000..b4f6750718fe8
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
@@ -0,0 +1,31 @@
+//===-- XeVMToLLVMIRTranslation.h - XeVM to LLVM IR -------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for XeVM dialect to LLVM IR translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+
+/// Register the XeVM dialect and the translation from it to the LLVM IR in the
+/// given registry;
+void registerXeVMDialectTranslation(mlir::DialectRegistry &registry);
+
+/// Register the XeVM dialect and the translation from it in the registry
+/// associated with the given context.
+void registerXeVMDialectTranslation(mlir::MLIRContext &context);
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index af22a7ff04bf0..9ea5c6835e8ef 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -60,6 +60,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
   MLIRROCDLToLLVMIRTranslation
   MLIRSPIRVToLLVMIRTranslation
   MLIRVCIXToLLVMIRTranslation
+  MLIRXeVMToLLVMIRTranslation
   )
 
 add_mlir_translation_library(MLIRTargetLLVMIRImport
diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
index f030fa78942d5..86c731a1074c3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(OpenMP)
 add_subdirectory(ROCDL)
 add_subdirectory(SPIRV)
 add_subdirectory(VCIX)
+add_subdirectory(XeVM)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
new file mode 100644
index 0000000000000..6308d7e2e4404
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_OPTIONAL_SOURCES
+  XeVMToLLVMIRTranslation.cpp
+)
+
+add_mlir_translation_library(MLIRXeVMToLLVMIRTranslation
+  XeVMToLLVMIRTranslation.cpp
+
+  DEPENDS
+  MLIRXeVMConversionsIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRDialectUtils
+  MLIRIR
+  MLIRLLVMDialect
+  MLIRXeVMDialect
+  MLIRSupport
+  MLIRTargetLLVMIRExport
+)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
new file mode 100644
index 0000000000000..67ecb53ca4b3b
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
@@ -0,0 +1,108 @@
+//===-- XeVMToLLVMIRTranslation.cpp - Translate XeVM to LLVM IR -*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between the MLIR XeVM dialect and
+// LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+namespace {
+/// Implementation of the dialect interface that converts operations belonging
+/// to the XeVM dialect to LLVM IR.
+class XeVMDialectLLVMIRTranslationInterface
+    : public LLVMTranslationDialectInterface {
+public:
+  using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+  /// Attaches module-level metadata for functions marked as kernels.
+  LogicalResult
+  amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
+                 NamedAttribute attribute,
+                 LLVM::ModuleTranslation &moduleTranslation) const final {
+    StringRef attrName = attribute.getName().getValue();
+    if (attrName == mlir::xevm::XeVMDialect::getCacheControlsAttrName()) {
+      auto cacheControlsArray = dyn_cast<ArrayAttr>(attribute.getValue());
+      if (cacheControlsArray.size() != 2) {
+        return op->emitOpError(
+            "Expected both L1 and L3 cache control attributes!");
+      }
+      if (instructions.size() != 1) {
+        return op->emitOpError("Expecting a single instruction");
+      }
+      return handleDecorationCacheControl(instructions.front(),
+                                          cacheControlsArray.getValue());
+    }
+    auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+    if (!func)
+      return failure();
+
+    return success();
+  }
+
+private:
+  template <typename IntTy>
+  static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) {
+    return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val));
+  }
+
+  static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst,
+                                                    ArrayRef<Attribute> attrs) {
+    SmallVector<llvm::Metadata *> decorations;
+    llvm::LLVMContext &ctx = inst->getContext();
+    llvm::Type *i32Ty = llvm::IntegerType::getInt32Ty(ctx);
+    llvm::transform(
+        attrs, std::back_inserter(decorations),
+        [&ctx, i32Ty](Attribute attr) -> llvm::Metadata * {
+          auto valuesArray = dyn_cast<ArrayAttr>(attr).getValue();
+          std::array<llvm::Metadata *, 4> metadata;
+          llvm::transform(
+              valuesArray, metadata.begin(), [i32Ty](Attribute valueAttr) {
+                return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                    i32Ty, cast<IntegerAttr>(valueAttr).getValue()));
+              });
+          return llvm::MDNode::get(ctx, metadata);
+        });
+    constexpr llvm::StringLiteral decorationCacheControlMDName =
+        "spirv.DecorationCacheControlINTEL";
+    inst->setMetadata(decorationCacheControlMDName,
+                      llvm::MDNode::get(ctx, decorations));
+    return success();
+  }
+};
+} // namespace
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
+  registry.insert<xevm::XeVMDialect>();
+  registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) {
+    dialect->addInterfaces<XeVMDialectLLVMIRTranslationInterface>();
+  });
+}
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMDialectTranslation(registry);
+  context.appendDialectRegistry(registry);
+}
diff --git a/mlir/test/Target/LLVMIR/xevm.mlir b/mlir/test/Target/LLVMIR/xevm.mlir
new file mode 100644
index 0000000000000..c71c235233ad5
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/xevm.mlir
@@ -0,0 +1,101 @@
+// RUN: mlir-translate --split-input-file -mlir-to-llvmir %s | FileCheck %s
+
+module {
+  llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
+  llvm.func @blockload2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) -> vector<8xi16> {
+    %0 = llvm.mlir.undef : vector<2xi32>
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
+    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
+    %5 = llvm.mlir.constant(8 : i32) : i32
+    %6 = llvm.alloca %5 x i16 : (i32) -> !llvm.ptr
+    // CHECK-LABEL: call spir_func void @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt
+    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
+    llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, %4, %6)
+      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage<external>, no_unwind,
+       sym_name = "_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, will_return,
+       xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
+      : (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> ()
+    %7 = llvm.load %6 : !llvm.ptr -> vector<8xi16>
+    llvm.return %7 : vector<8xi16>
+  }
+}
+
+// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
+// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0}
+// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0}
+
+// -----
+module {
+  llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) attributes {no_unwind, will_return}
+  llvm.func @blockstore2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi32>) {
+    %0 = llvm.mlir.undef : vector<2xi32>
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
+    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
+    %5 = llvm.mlir.constant(8 : i32) : i32
+    %6 = llvm.alloca %5 x i32 : (i32) -> !llvm.ptr
+    llvm.store %arg6, %6 : vector<8xi32>, !llvm.ptr
+    // CHECK-LABEL: call spir_func void @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj
+    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
+    llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, %4, %6)
+      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage<external>, no_unwind,
+       sym_name = "_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, will_return,
+       xevm.DecorationCacheControl = [[6443 : i32, 0 : i32, 2 : i32, 0 : i32], [6443 : i32, 1 : i32, 2 : i32, 0 : i32]]}
+      : (!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) -> ()
+    llvm.return
+  }
+}
+
+// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
+// CHECK: ![[DECO2]] = !{i32 6443, i32 0, i32 2, i32 0}
+// CHECK: ![[DECO3]] = !{i32 6443, i32 1, i32 2, i32 0}
+
+// -----
+module {
+  llvm.func spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind}
+  llvm.func @blockprefetch2d(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+    %0 = llvm.mlir.undef : vector<2xi32>
+    %1 = llvm.mlir.constant(0 : i32) : i32
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
+    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
+    // CHECK-LABEL: call spir_func void @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i
+    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
+    llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(%arg0, %arg1, %arg2, %arg3, %4)
+      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>)>, linkage = #llvm.linkage<external>,
+       memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind,
+       sym_name = "_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i", visibility_ = 0 : i64,
+       xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
+      : (!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) -> ()
+    llvm.return
+  }
+}
+
+// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
+// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0}
+// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0}
+
+// -----
+module {
+  llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind}
+  llvm.func @prefetch(%arg0: !llvm.ptr<1>) {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    // CHECK-LABEL: call spir_func void @_Z8prefetchPU3AS1Kcm
+    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
+    llvm.call spir_funccc @_Z8prefetchPU3AS1Kcm(%arg0, %0)
+      {function_type = !llvm.func<void (ptr<1>, i64)>, linkage = #llvm.linkage<external>,
+       memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>,
+       no_unwind, sym_name = "_Z8prefetchPU3AS1Kcm", visibility_ = 0 : i64,
+       xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
+      : (!llvm.ptr<1>, i64) -> ()
+    llvm.return
+  }
+}
+
+// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
+// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0}
+// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0}
+

>From 6fef03fd48437168700b37ccd4607eeec6048024 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Mon, 28 Jul 2025 17:08:42 +0000
Subject: [PATCH 3/9] Address reviewer comments.

---
 .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp  |  9 +-
 mlir/test/Target/LLVMIR/xevm.mlir             | 82 +------------------
 2 files changed, 3 insertions(+), 88 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
index 67ecb53ca4b3b..73b166d045d5b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
@@ -63,11 +63,6 @@ class XeVMDialectLLVMIRTranslationInterface
   }
 
 private:
-  template <typename IntTy>
-  static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) {
-    return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val));
-  }
-
   static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst,
                                                     ArrayRef<Attribute> attrs) {
     SmallVector<llvm::Metadata *> decorations;
@@ -94,14 +89,14 @@ class XeVMDialectLLVMIRTranslationInterface
 };
 } // namespace
 
-void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
+void mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
   registry.insert<xevm::XeVMDialect>();
   registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) {
     dialect->addInterfaces<XeVMDialectLLVMIRTranslationInterface>();
   });
 }
 
-void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
+void mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
   DialectRegistry registry;
   registerXeVMDialectTranslation(registry);
   context.appendDialectRegistry(registry);
diff --git a/mlir/test/Target/LLVMIR/xevm.mlir b/mlir/test/Target/LLVMIR/xevm.mlir
index c71c235233ad5..a3dd0b6c17914 100644
--- a/mlir/test/Target/LLVMIR/xevm.mlir
+++ b/mlir/test/Target/LLVMIR/xevm.mlir
@@ -1,93 +1,13 @@
 // RUN: mlir-translate --split-input-file -mlir-to-llvmir %s | FileCheck %s
 
 module {
-  llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
-  llvm.func @blockload2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) -> vector<8xi16> {
-    %0 = llvm.mlir.undef : vector<2xi32>
-    %1 = llvm.mlir.constant(0 : i32) : i32
-    %2 = llvm.mlir.constant(1 : i32) : i32
-    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
-    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
-    %5 = llvm.mlir.constant(8 : i32) : i32
-    %6 = llvm.alloca %5 x i16 : (i32) -> !llvm.ptr
-    // CHECK-LABEL: call spir_func void @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt
-    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
-    llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, %4, %6)
-      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage<external>, no_unwind,
-       sym_name = "_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, will_return,
-       xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
-      : (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> ()
-    %7 = llvm.load %6 : !llvm.ptr -> vector<8xi16>
-    llvm.return %7 : vector<8xi16>
-  }
-}
-
-// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
-// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0}
-// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0}
-
-// -----
-module {
-  llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) attributes {no_unwind, will_return}
-  llvm.func @blockstore2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi32>) {
-    %0 = llvm.mlir.undef : vector<2xi32>
-    %1 = llvm.mlir.constant(0 : i32) : i32
-    %2 = llvm.mlir.constant(1 : i32) : i32
-    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
-    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
-    %5 = llvm.mlir.constant(8 : i32) : i32
-    %6 = llvm.alloca %5 x i32 : (i32) -> !llvm.ptr
-    llvm.store %arg6, %6 : vector<8xi32>, !llvm.ptr
-    // CHECK-LABEL: call spir_func void @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj
-    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
-    llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, %4, %6)
-      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage<external>, no_unwind,
-       sym_name = "_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, will_return,
-       xevm.DecorationCacheControl = [[6443 : i32, 0 : i32, 2 : i32, 0 : i32], [6443 : i32, 1 : i32, 2 : i32, 0 : i32]]}
-      : (!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) -> ()
-    llvm.return
-  }
-}
-
-// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
-// CHECK: ![[DECO2]] = !{i32 6443, i32 0, i32 2, i32 0}
-// CHECK: ![[DECO3]] = !{i32 6443, i32 1, i32 2, i32 0}
-
-// -----
-module {
-  llvm.func spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind}
-  llvm.func @blockprefetch2d(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
-    %0 = llvm.mlir.undef : vector<2xi32>
-    %1 = llvm.mlir.constant(0 : i32) : i32
-    %2 = llvm.mlir.constant(1 : i32) : i32
-    %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32>
-    %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32>
-    // CHECK-LABEL: call spir_func void @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i
-    // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
-    llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(%arg0, %arg1, %arg2, %arg3, %4)
-      {function_type = !llvm.func<void (ptr<1>, i32, i32, i32, vector<2xi32>)>, linkage = #llvm.linkage<external>,
-       memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind,
-       sym_name = "_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i", visibility_ = 0 : i64,
-       xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
-      : (!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) -> ()
-    llvm.return
-  }
-}
-
-// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]}
-// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0}
-// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0}
-
-// -----
-module {
-  llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>, no_unwind}
+  llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64)
   llvm.func @prefetch(%arg0: !llvm.ptr<1>) {
     %0 = llvm.mlir.constant(1 : i64) : i64
     // CHECK-LABEL: call spir_func void @_Z8prefetchPU3AS1Kcm
     // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]]
     llvm.call spir_funccc @_Z8prefetchPU3AS1Kcm(%arg0, %0)
       {function_type = !llvm.func<void (ptr<1>, i64)>, linkage = #llvm.linkage<external>,
-       memory_effects = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = none>,
        no_unwind, sym_name = "_Z8prefetchPU3AS1Kcm", visibility_ = 0 : i64,
        xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]}
       : (!llvm.ptr<1>, i64) -> ()

>From 6e160dfde82aca85aa12aa22ed4abe551f40b204 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Jul 2025 00:48:40 +0000
Subject: [PATCH 4/9] Temp save.

---
 mlir/include/mlir/Target/LLVM/XeVM/Utils.h |  15 +-
 mlir/lib/Target/LLVM/XeVM/Target.cpp       | 295 ++++++++++++++++-----
 2 files changed, 237 insertions(+), 73 deletions(-)

diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
index c11a97f0d960a..95764fa99e0fc 100644
--- a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -15,6 +15,7 @@
 
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/Attributes.h"
 #include "mlir/Target/LLVM/ModuleToObject.h"
 
 namespace mlir {
@@ -22,16 +23,24 @@ namespace xevm {
 
 /// Base class for all XeVM serializations from GPU modules into binary strings.
 /// By default this class serializes into LLVM bitcode.
-class SerializeGPUModuleBase : public mlir::LLVM::ModuleToObject {
+class SerializeGPUModuleBase : public LLVM::ModuleToObject {
 public:
-  SerializeGPUModuleBase(mlir::Operation &module, XeVMTargetAttr target,
-                         const mlir::gpu::TargetOptions &targetOptions = {});
+  SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target,
+                         const gpu::TargetOptions &targetOptions = {});
 
   static void init();
   XeVMTargetAttr getTarget() const;
 
+  /// Loads the bitcode files in `librariesToLink`.
+  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+  loadBitcodeFiles(llvm::Module &module) override;
+
 protected:
   XeVMTargetAttr target;
+  /// List of LLVM bitcode to link into after translation to LLVM IR.
+  /// The attributes can be StringAttr pointing to a file path, or
+  /// a Resource blob pointing to the LLVM bitcode in-memory.
+  SmallVector<Attribute> librariesToLink;
 };
 } // namespace xevm
 } // namespace mlir
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
index 380e2bff222ca..85d8405d20888 100644
--- a/mlir/lib/Target/LLVM/XeVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -1,12 +1,12 @@
-//===-- Target.cpp - MLIR LLVM XeVM target compilation ----------*- C++ -*-===//
+//===- Target.cpp - MLIR LLVM XeVM target compilation -----------*- C++ -*-===//
 //
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines XeVM target related functions including registration
+// This files defines XeVM target related functions including registration
 // calls for the `#xevm.target` compilation attribute.
 //
 //===----------------------------------------------------------------------===//
@@ -15,41 +15,36 @@
 
 #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
-#include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/DialectResourceBlobManager.h"
 #include "mlir/Target/LLVM/XeVM/Utils.h"
 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Target/TargetMachine.h"
 
 #include "llvm/Config/Targets.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
-
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-
-// FIXME: One of the headers uses `.inc` file from the build directory, this
-// does not work for installation (i.e., DCMAKE_INSTALL_PREFIX) caching as build
-// directory will not be cached. Since float atomics are not yet supported by
-// the backend anyway, we can afford to temporarily comment this section.
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
 
-// #if LLVM_HAS_SPIRV_TARGET
-// #pragma GCC diagnostic push
-// #pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
-// #include "SPIRVTargetMachine.h"
-// #pragma GCC diagnostic pop
-
-// #include "SPIRVCommandLine.h"
-// #endif // LLVM_HAS_SPIRV_TARGET
-
-#include <set>
+#include <cstdint>
+#include <cstdlib>
 
 using namespace mlir;
+using namespace mlir::xevm;
 
 namespace {
 // XeVM implementation of the gpu:TargetAttrInterface.
@@ -68,10 +63,9 @@ class XeVMTargetAttrImpl
 
 void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
     DialectRegistry &registry) {
-  registry.addExtension(
-      +[](MLIRContext *ctx, mlir::xevm::XeVMDialect *dialect) {
-        mlir::xevm::XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
-      });
+  registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) {
+    XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
+  });
 }
 
 void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
@@ -81,13 +75,17 @@ void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
   context.appendDialectRegistry(registry);
 }
 
-mlir::xevm::SerializeGPUModuleBase::SerializeGPUModuleBase(
-    Operation &module, mlir::xevm::XeVMTargetAttr target,
+SerializeGPUModuleBase::SerializeGPUModuleBase(
+    Operation &module, XeVMTargetAttr target,
     const gpu::TargetOptions &targetOptions)
     : ModuleToObject(module, target.getTriple(), "", {}, target.getO()),
-      target(target) {}
+      target(target), librariesToLink(targetOptions.getLibrariesToLink()) {
+  if (target.getLinkFiles())
+    librariesToLink.append(target.getLinkFiles().begin(),
+                           target.getLinkFiles().end());
+}
 
-void mlir::xevm::SerializeGPUModuleBase::init() {
+void SerializeGPUModuleBase::init() {
   static llvm::once_flag initializeBackendOnce;
   llvm::call_once(initializeBackendOnce, []() {
 #if LLVM_HAS_SPIRV_TARGET
@@ -99,24 +97,40 @@ void mlir::xevm::SerializeGPUModuleBase::init() {
   });
 }
 
-mlir::xevm::XeVMTargetAttr
-mlir::xevm::SerializeGPUModuleBase::getTarget() const {
-  return target;
+XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }
+
+std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
+  if (librariesToLink.empty())
+    return SmallVector<std::unique_ptr<llvm::Module>>();
+  SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
+  if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
+                                      bcFiles)))
+    return std::nullopt;
+  return std::move(bcFiles);
 }
 
 namespace {
-class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase {
+class SpirSerializer : public SerializeGPUModuleBase {
 public:
-  SpirSerializer(Operation &module, mlir::xevm::XeVMTargetAttr target,
+  SpirSerializer(Operation &module, XeVMTargetAttr target,
                  const gpu::TargetOptions &targetOptions)
-      : mlir::xevm::SerializeGPUModuleBase(module, target, targetOptions) {}
+      : SerializeGPUModuleBase(module, target, targetOptions),
+        targetOptions(targetOptions) {}
 
   gpu::GPUModuleOp getOperation();
 
+  /// Serializes the LLVM module to an object format, depending on the
+  /// compilation target selected in target options.
   std::optional<SmallVector<char, 0>>
   moduleToObject(llvm::Module &llvmModule) override;
 
+  /// Compiles to native code using `ocloc`.
+  std::optional<SmallVector<char, 0>> compileToBinary(const std::string &asmStr,
+                                                      StringRef inputFormat);
+
 private:
+  std::optional<std::string> findTool(StringRef tool);
   std::optional<std::string>
   translateToSPIRVBinary(llvm::Module &llvmModule,
                          llvm::TargetMachine &targetMachine);
@@ -125,19 +139,27 @@ class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase {
 } // namespace
 
 gpu::GPUModuleOp SpirSerializer::getOperation() {
-  return dyn_cast<gpu::GPUModuleOp>(
-      &mlir::xevm::SerializeGPUModuleBase::getOperation());
+  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
 }
 
 std::optional<SmallVector<char, 0>>
 SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
+#define DEBUG_TYPE "serialize-to-llvm"
+  LLVM_DEBUG({
+    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
+                 << "\n";
+    llvm::dbgs() << llvmModule << "\n";
+    llvm::dbgs().flush();
+  });
+#undef DEBUG_TYPE
+
   // Return LLVM IR if the compilation target is `offload`.
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
-    return mlir::xevm::SerializeGPUModuleBase::moduleToObject(llvmModule);
+    return SerializeGPUModuleBase::moduleToObject(llvmModule);
 
 #if !LLVM_HAS_SPIRV_TARGET
-  getOperation()->emitError(
-      "The `SPIRV` target was not built. Please enable it when building LLVM.");
+  getOperation()->emitError("The `SPIRV` target was not built. Please enable "
+                            "it when building LLVM.");
   return std::nullopt;
 #endif // LLVM_HAS_SPIRV_TARGET
 
@@ -145,36 +167,29 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
       getOrCreateTargetMachine();
   if (!targetMachine) {
     getOperation().emitError() << "Target Machine unavailable for triple "
-                               << triple << ", can't compile with LLVM\n";
+                               << triple << ", can't optimize with LLVM\n";
     return std::nullopt;
   }
 
-  //===----------------------------------------------------------------------===//
-  // Workaround to enable spirv extensions that are not added to target machine
-  // by default.
-
-  // FIXME: see fixme comment above SPIRV headers.
-  // #if LLVM_HAS_SPIRV_TARGET
-  //   std::set<llvm::SPIRV::Extension::Extension> AllowedExtIds{
-  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add,
-  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add};
-  //   llvm::SPIRVTargetMachine *STM =
-  //       static_cast<llvm::SPIRVTargetMachine *>(targetMachine.value());
-  //   const_cast<llvm::SPIRVSubtarget *>(STM->getSubtargetImpl())
-  //       ->initAvailableExtensions(AllowedExtIds);
-  // #endif // LLVM_HAS_SPIRV_TARGET
+  std::optional<std::string> serializedISA =
+      translateToISA(llvmModule, **targetMachine);
+  if (!serializedISA) {
+    getOperation().emitError() << "Failed translating the module to ISA."
+                               << triple << ", can't compile with LLVM\n";
+    return std::nullopt;
+  }
 
-  //===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "serialize-to-isa"
+  LLVM_DEBUG({
+    llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() << "\n";
+    llvm::dbgs() << *serializedISA << "\n";
+    llvm::dbgs().flush();
+  });
+#undef DEBUG_TYPE
 
   // Return SPIRV if the compilation target is `assembly`.
   if (targetOptions.getCompilationTarget() ==
       gpu::CompilationTarget::Assembly) {
-    std::optional<std::string> serializedISA =
-        translateToISA(llvmModule, **targetMachine);
-    if (!serializedISA) {
-      getOperation().emitError() << "Failed translating the module to ISA.";
-      return std::nullopt;
-    }
     // Make sure to include the null terminator.
     StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
     return SmallVector<char, 0>(bin.begin(), bin.end());
@@ -194,6 +209,137 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
   return SmallVector<char, 0>(bin.begin(), bin.end());
 }
 
+std::optional<std::string> SpirSerializer::findTool(StringRef tool) {
+  // 1. Check the toolkit path given in the command line.
+  StringRef pathRef = targetOptions.getToolkitPath();
+  SmallVector<char, 256> path;
+  if (!pathRef.empty()) {
+    path.insert(path.begin(), pathRef.begin(), pathRef.end());
+    llvm::sys::path::append(path, "bin", tool);
+    if (llvm::sys::fs::can_execute(path))
+      return StringRef(path.data(), path.size()).str();
+  }
+  // 2. Check PATH.
+  if (std::optional<std::string> toolPath =
+          llvm::sys::Process::FindInEnvPath("PATH", tool))
+    return *toolPath;
+
+  getOperation().emitError()
+      << "Couldn't find the `" << tool
+      << "` binary. Please specify the toolkit "
+         "path via GpuModuleToBinaryPass or add the compiler to $PATH`.";
+  return std::nullopt;
+}
+
+// There is 1 way to finalize SPIR-V to native code: IGC
+// There are 2 ways to access IGC: AOT (ocloc) and JIT (L0 runtime).
+// - L0 runtime consumes SPIR-V and is external to MLIR codebase (rt wrappers).
+// - `ocloc` tool can be "queried" from within MLIR.
+std::optional<SmallVector<char, 0>>
+SpirSerializer::compileToBinary(const std::string &asmStr,
+                                StringRef inputFormat) {
+  using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
+  // Find the `ocloc` tool.
+  std::optional<std::string> oclocCompiler = findTool("ocloc");
+  if (!oclocCompiler)
+    return std::nullopt;
+  Location loc = getOperation().getLoc();
+  std::string basename =
+      llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(),
+                    getTarget().getTriple(), getTarget().getChip());
+
+  auto createTemp = [&](StringRef name,
+                        StringRef suffix) -> std::optional<TmpFile> {
+    llvm::SmallString<128> filePath;
+    if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) {
+      getOperation().emitError()
+          << "Couldn't create the temp file: `" << filePath
+          << "`, error message: " << ec.message();
+      return std::nullopt;
+    }
+    return TmpFile(filePath, llvm::FileRemover(filePath.c_str()));
+  };
+  // Create temp file
+  std::optional<TmpFile> asmFile = createTemp(basename, "asm");
+  std::optional<TmpFile> binFile = createTemp(basename, "");
+  std::optional<TmpFile> logFile = createTemp(basename, "log");
+  if (!logFile || !asmFile || !binFile)
+    return std::nullopt;
+  // Dump the assembly to a temp file
+  std::error_code ec;
+  {
+    llvm::raw_fd_ostream asmStream(asmFile->first, ec);
+    if (ec) {
+      emitError(loc) << "Couldn't open the file: `" << asmFile->first
+                     << "`, error message: " << ec.message();
+      return std::nullopt;
+    }
+    asmStream << asmStr;
+    if (asmStream.has_error()) {
+      emitError(loc) << "An error occurred while writing the assembly to: `"
+                     << asmFile->first << "`.";
+      return std::nullopt;
+    }
+    asmStream.flush();
+  }
+  // Set cmd options
+  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
+      targetOptions.tokenizeCmdOptions();
+  // Example: --gpu-module-to-binary="opts='opt1 opt2'"
+  const std::string cmdOptsStr = "\"" + llvm::join(cmdOpts.second, " ") + "\"";
+  SmallVector<StringRef, 12> oclocArgs(
+      {"ocloc", "compile", "-file", asmFile->first, inputFormat, "-device",
+       getTarget().getChip(), "-output", binFile->first, "-output_no_suffix",
+       "-options", cmdOptsStr});
+
+// Dump tool invocation commands.
+#define DEBUG_TYPE "serialize-to-binary"
+  LLVM_DEBUG({
+    llvm::dbgs() << "Tool invocation for module: "
+                 << getOperation().getNameAttr() << "\n";
+    llvm::interleave(oclocArgs, llvm::dbgs(), " ");
+    llvm::dbgs() << "\n";
+  });
+#undef DEBUG_TYPE
+  // Helper function for printing tool error logs.
+  std::string message;
+  auto emitLogError =
+      [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
+    if (message.empty()) {
+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
+          llvm::MemoryBuffer::getFile(logFile->first);
+      if (toolStderr)
+        emitError(loc) << toolName << " invocation failed. Log:\n"
+                       << toolStderr->get()->getBuffer();
+      else
+        emitError(loc) << toolName << " invocation failed.";
+      return std::nullopt;
+    }
+    emitError(loc) << toolName
+                   << " invocation failed, error message: " << message;
+    return std::nullopt;
+  };
+  std::optional<StringRef> redirects[] = {
+      std::nullopt,
+      logFile->first,
+      logFile->first,
+  };
+  // Invoke ocloc.
+  if (llvm::sys::ExecuteAndWait(oclocCompiler.value(), oclocArgs, std::nullopt,
+                                redirects, 0, 0, &message))
+    return emitLogError("`ocloc`");
+  binFile->first.append(".bin");
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
+      llvm::MemoryBuffer::getFile(binFile->first);
+  if (!binaryBuffer) {
+    emitError(loc) << "Couldn't open the file: `" << binFile->first
+                   << "`, error message: " << binaryBuffer.getError().message();
+     return std::nullopt;
+   }
+  StringRef bin = (*binaryBuffer)->getBuffer();
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
 std::optional<std::string>
 SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
                                        llvm::TargetMachine &targetMachine) {
@@ -232,7 +378,7 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
   });
 
   SpirSerializer serializer(
-      *module, cast<mlir::xevm::XeVMTargetAttr>(attribute), options);
+      *module, cast<XeVMTargetAttr>(attribute), options);
   serializer.init();
 
 #if !LLVM_HAS_SPIRV_TARGET
@@ -247,9 +393,18 @@ Attribute
 XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
                                  const SmallVector<char, 0> &object,
                                  const gpu::TargetOptions &options) const {
+  Builder builder(attribute.getContext());
   gpu::CompilationTarget format = options.getCompilationTarget();
+  auto target = cast<XeVMTargetAttr>(attribute);
+  SmallVector<NamedAttribute, 2> properties;
+  if (format == gpu::CompilationTarget::Assembly)
+    properties.push_back(
+        builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO())));
+
   DictionaryAttr objectProps;
-  Builder builder(attribute.getContext());
+  if (!properties.empty())
+    objectProps = builder.getDictionaryAttr(properties);
+
   return builder.getAttr<gpu::ObjectAttr>(
       attribute, format,
       builder.getStringAttr(StringRef(object.data(), object.size())),

>From b1f6bc3ad57ceb9cc20662eb691bbdbbc7427740 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Jul 2025 18:04:13 +0000
Subject: [PATCH 5/9] Temp save.

---
 mlir/lib/Target/LLVM/XeVM/Target.cpp | 29 +++++-----------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
index 85d8405d20888..8f95fdc3a8beb 100644
--- a/mlir/lib/Target/LLVM/XeVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -132,8 +132,6 @@ class SpirSerializer : public SerializeGPUModuleBase {
 private:
   std::optional<std::string> findTool(StringRef tool);
   std::optional<std::string>
-  translateToSPIRVBinary(llvm::Module &llvmModule,
-                         llvm::TargetMachine &targetMachine);
   gpu::TargetOptions targetOptions;
 };
 } // namespace
@@ -188,13 +186,14 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
 #undef DEBUG_TYPE
 
   // Return SPIRV if the compilation target is `assembly`.
-  if (targetOptions.getCompilationTarget() ==
-      gpu::CompilationTarget::Assembly) {
+//  if (targetOptions.getCompilationTarget() ==
+//      gpu::CompilationTarget::Assembly) {
     // Make sure to include the null terminator.
     StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
     return SmallVector<char, 0>(bin.begin(), bin.end());
-  }
+//  }
 
+/*
   std::optional<std::string> serializedSPIRVBinary =
       translateToSPIRVBinary(llvmModule, **targetMachine);
   if (!serializedSPIRVBinary) {
@@ -207,6 +206,7 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
   }
   StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
   return SmallVector<char, 0>(bin.begin(), bin.end());
+*/
 }
 
 std::optional<std::string> SpirSerializer::findTool(StringRef tool) {
@@ -340,25 +340,6 @@ SpirSerializer::compileToBinary(const std::string &asmStr,
   return SmallVector<char, 0>(bin.begin(), bin.end());
 }
 
-std::optional<std::string>
-SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
-                                       llvm::TargetMachine &targetMachine) {
-  std::string targetISA;
-  llvm::raw_string_ostream stream(targetISA);
-
-  { // Drop pstream after this to prevent the ISA from being stuck buffering
-    llvm::buffer_ostream pstream(stream);
-    llvm::legacy::PassManager codegenPasses;
-
-    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                          llvm::CodeGenFileType::ObjectFile))
-      return std::nullopt;
-
-    codegenPasses.run(llvmModule);
-  }
-  return targetISA;
-}
-
 std::optional<SmallVector<char, 0>>
 XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
                                       const gpu::TargetOptions &options) const {

>From 415166e221e7684dd63a147106343ce44a1d1acd Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Jul 2025 22:15:29 +0000
Subject: [PATCH 6/9] Temp save.

---
 mlir/lib/Target/LLVM/XeVM/Target.cpp | 29 +++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
index 8f95fdc3a8beb..4d67e67fa5a56 100644
--- a/mlir/lib/Target/LLVM/XeVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -132,6 +132,8 @@ class SpirSerializer : public SerializeGPUModuleBase {
 private:
   std::optional<std::string> findTool(StringRef tool);
   std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
   gpu::TargetOptions targetOptions;
 };
 } // namespace
@@ -186,14 +188,14 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
 #undef DEBUG_TYPE
 
   // Return SPIRV if the compilation target is `assembly`.
-//  if (targetOptions.getCompilationTarget() ==
-//      gpu::CompilationTarget::Assembly) {
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
     // Make sure to include the null terminator.
     StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
     return SmallVector<char, 0>(bin.begin(), bin.end());
-//  }
+  }
+
 
-/*
   std::optional<std::string> serializedSPIRVBinary =
       translateToSPIRVBinary(llvmModule, **targetMachine);
   if (!serializedSPIRVBinary) {
@@ -206,7 +208,24 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
   }
   StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
   return SmallVector<char, 0>(bin.begin(), bin.end());
-*/
+}
+
+std::optional<std::string>
+SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  llvm::raw_string_ostream stream(targetISA);
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CodeGenFileType::ObjectFile))
+      return std::nullopt;
+
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
 }
 
 std::optional<std::string> SpirSerializer::findTool(StringRef tool) {

>From dcefcc6a14d95d772309936e2e666800744c78f7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 31 Jul 2025 22:51:32 +0000
Subject: [PATCH 7/9] Temp save.

---
 mlir/lib/Target/LLVM/XeVM/Target.cpp | 41 ++++++++++++++--------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
index 4d67e67fa5a56..8bc934190d0c3 100644
--- a/mlir/lib/Target/LLVM/XeVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -28,6 +28,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Target/TargetMachine.h"
 
+#include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/Config/Targets.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FileUtilities.h"
@@ -38,7 +39,6 @@
 #include "llvm/Support/Program.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
 
 #include <cstdint>
 #include <cstdlib>
@@ -171,31 +171,31 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
     return std::nullopt;
   }
 
-  std::optional<std::string> serializedISA =
-      translateToISA(llvmModule, **targetMachine);
-  if (!serializedISA) {
-    getOperation().emitError() << "Failed translating the module to ISA."
-                               << triple << ", can't compile with LLVM\n";
-    return std::nullopt;
-  }
+  // Return SPIRV if the compilation target is `assembly`.
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    std::optional<std::string> serializedISA =
+        translateToISA(llvmModule, **targetMachine);
+    if (!serializedISA) {
+      getOperation().emitError() << "Failed translating the module to ISA."
+                                 << triple << ", can't compile with LLVM\n";
+      return std::nullopt;
+    }
 
 #define DEBUG_TYPE "serialize-to-isa"
-  LLVM_DEBUG({
-    llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() << "\n";
-    llvm::dbgs() << *serializedISA << "\n";
-    llvm::dbgs().flush();
-  });
+    LLVM_DEBUG({
+      llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr()
+                   << "\n";
+      llvm::dbgs() << *serializedISA << "\n";
+      llvm::dbgs().flush();
+    });
 #undef DEBUG_TYPE
 
-  // Return SPIRV if the compilation target is `assembly`.
-  if (targetOptions.getCompilationTarget() ==
-      gpu::CompilationTarget::Assembly) {
     // Make sure to include the null terminator.
     StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
     return SmallVector<char, 0>(bin.begin(), bin.end());
   }
 
-
   std::optional<std::string> serializedSPIRVBinary =
       translateToSPIRVBinary(llvmModule, **targetMachine);
   if (!serializedSPIRVBinary) {
@@ -353,8 +353,8 @@ SpirSerializer::compileToBinary(const std::string &asmStr,
   if (!binaryBuffer) {
     emitError(loc) << "Couldn't open the file: `" << binFile->first
                    << "`, error message: " << binaryBuffer.getError().message();
-     return std::nullopt;
-   }
+    return std::nullopt;
+  }
   StringRef bin = (*binaryBuffer)->getBuffer();
   return SmallVector<char, 0>(bin.begin(), bin.end());
 }
@@ -377,8 +377,7 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
     return WalkResult::advance();
   });
 
-  SpirSerializer serializer(
-      *module, cast<XeVMTargetAttr>(attribute), options);
+  SpirSerializer serializer(*module, cast<XeVMTargetAttr>(attribute), options);
   serializer.init();
 
 #if !LLVM_HAS_SPIRV_TARGET

>From 6d2e1ee3412bae9c2a2b5a28b910a9bb9b90b153 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 7 Aug 2025 17:02:29 +0000
Subject: [PATCH 8/9] Refactor to prepare for future target triples.

---
 mlir/include/mlir/Target/LLVM/XeVM/Utils.h |  13 +-
 mlir/lib/Target/LLVM/XeVM/Target.cpp       | 341 ++++++++++-----------
 2 files changed, 182 insertions(+), 172 deletions(-)

diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
index 95764fa99e0fc..ae0836ea89f57 100644
--- a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -28,19 +28,30 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target,
                          const gpu::TargetOptions &targetOptions = {});
 
-  static void init();
   XeVMTargetAttr getTarget() const;
 
   /// Loads the bitcode files in `librariesToLink`.
   std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
   loadBitcodeFiles(llvm::Module &module) override;
 
+  /// Returns the gpu module being serialized.
+  gpu::GPUModuleOp getGPUModuleOp();
+
+  /// Compiles to native code using `ocloc`.
+  std::optional<SmallVector<char, 0>> compileToBinary(const std::string &asmStr,
+                                                      StringRef inputFormat);
+
 protected:
   XeVMTargetAttr target;
   /// List of LLVM bitcode to link into after translation to LLVM IR.
   /// The attributes can be StringAttr pointing to a file path, or
   /// a Resource blob pointing to the LLVM bitcode in-memory.
   SmallVector<Attribute> librariesToLink;
+
+  /// Returns the path to the tool used for serialization.
+  std::optional<std::string> findTool(StringRef tool);
+
+  gpu::TargetOptions targetOptions;
 };
 } // namespace xevm
 } // namespace mlir
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
index 8bc934190d0c3..60e69159023e9 100644
--- a/mlir/lib/Target/LLVM/XeVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -85,18 +85,6 @@ SerializeGPUModuleBase::SerializeGPUModuleBase(
                            target.getLinkFiles().end());
 }
 
-void SerializeGPUModuleBase::init() {
-  static llvm::once_flag initializeBackendOnce;
-  llvm::call_once(initializeBackendOnce, []() {
-#if LLVM_HAS_SPIRV_TARGET
-    LLVMInitializeSPIRVTarget();
-    LLVMInitializeSPIRVTargetInfo();
-    LLVMInitializeSPIRVTargetMC();
-    LLVMInitializeSPIRVAsmPrinter();
-#endif
-  });
-}
-
 XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }
 
 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
@@ -110,168 +98,32 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
   return std::move(bcFiles);
 }
 
-namespace {
-class SpirSerializer : public SerializeGPUModuleBase {
-public:
-  SpirSerializer(Operation &module, XeVMTargetAttr target,
-                 const gpu::TargetOptions &targetOptions)
-      : SerializeGPUModuleBase(module, target, targetOptions),
-        targetOptions(targetOptions) {}
-
-  gpu::GPUModuleOp getOperation();
-
-  /// Serializes the LLVM module to an object format, depending on the
-  /// compilation target selected in target options.
-  std::optional<SmallVector<char, 0>>
-  moduleToObject(llvm::Module &llvmModule) override;
-
-  /// Compiles to native code using `ocloc`.
-  std::optional<SmallVector<char, 0>> compileToBinary(const std::string &asmStr,
-                                                      StringRef inputFormat);
-
-private:
-  std::optional<std::string> findTool(StringRef tool);
-  std::optional<std::string>
-  translateToSPIRVBinary(llvm::Module &llvmModule,
-                         llvm::TargetMachine &targetMachine);
-  gpu::TargetOptions targetOptions;
-};
-} // namespace
-
-gpu::GPUModuleOp SpirSerializer::getOperation() {
+gpu::GPUModuleOp SerializeGPUModuleBase::getGPUModuleOp() {
   return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
 }
 
-std::optional<SmallVector<char, 0>>
-SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
-#define DEBUG_TYPE "serialize-to-llvm"
-  LLVM_DEBUG({
-    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
-                 << "\n";
-    llvm::dbgs() << llvmModule << "\n";
-    llvm::dbgs().flush();
-  });
-#undef DEBUG_TYPE
-
-  // Return LLVM IR if the compilation target is `offload`.
-  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
-    return SerializeGPUModuleBase::moduleToObject(llvmModule);
-
-#if !LLVM_HAS_SPIRV_TARGET
-  getOperation()->emitError("The `SPIRV` target was not built. Please enable "
-                            "it when building LLVM.");
-  return std::nullopt;
-#endif // LLVM_HAS_SPIRV_TARGET
-
-  std::optional<llvm::TargetMachine *> targetMachine =
-      getOrCreateTargetMachine();
-  if (!targetMachine) {
-    getOperation().emitError() << "Target Machine unavailable for triple "
-                               << triple << ", can't optimize with LLVM\n";
-    return std::nullopt;
-  }
-
-  // Return SPIRV if the compilation target is `assembly`.
-  if (targetOptions.getCompilationTarget() ==
-      gpu::CompilationTarget::Assembly) {
-    std::optional<std::string> serializedISA =
-        translateToISA(llvmModule, **targetMachine);
-    if (!serializedISA) {
-      getOperation().emitError() << "Failed translating the module to ISA."
-                                 << triple << ", can't compile with LLVM\n";
-      return std::nullopt;
-    }
-
-#define DEBUG_TYPE "serialize-to-isa"
-    LLVM_DEBUG({
-      llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr()
-                   << "\n";
-      llvm::dbgs() << *serializedISA << "\n";
-      llvm::dbgs().flush();
-    });
-#undef DEBUG_TYPE
-
-    // Make sure to include the null terminator.
-    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
-    return SmallVector<char, 0>(bin.begin(), bin.end());
-  }
-
-  std::optional<std::string> serializedSPIRVBinary =
-      translateToSPIRVBinary(llvmModule, **targetMachine);
-  if (!serializedSPIRVBinary) {
-    getOperation().emitError() << "Failed translating the module to Binary.";
-    return std::nullopt;
-  }
-  if (serializedSPIRVBinary->size() % 4) {
-    getOperation().emitError() << "SPIRV code size must be a multiple of 4.";
-    return std::nullopt;
-  }
-  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
-  return SmallVector<char, 0>(bin.begin(), bin.end());
-}
-
-std::optional<std::string>
-SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
-                                       llvm::TargetMachine &targetMachine) {
-  std::string targetISA;
-  llvm::raw_string_ostream stream(targetISA);
-
-  { // Drop pstream after this to prevent the ISA from being stuck buffering
-    llvm::buffer_ostream pstream(stream);
-    llvm::legacy::PassManager codegenPasses;
-    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                          llvm::CodeGenFileType::ObjectFile))
-      return std::nullopt;
-
-    codegenPasses.run(llvmModule);
-  }
-  return targetISA;
-}
-
-std::optional<std::string> SpirSerializer::findTool(StringRef tool) {
-  // 1. Check the toolkit path given in the command line.
-  StringRef pathRef = targetOptions.getToolkitPath();
-  SmallVector<char, 256> path;
-  if (!pathRef.empty()) {
-    path.insert(path.begin(), pathRef.begin(), pathRef.end());
-    llvm::sys::path::append(path, "bin", tool);
-    if (llvm::sys::fs::can_execute(path))
-      return StringRef(path.data(), path.size()).str();
-  }
-  // 2. Check PATH.
-  if (std::optional<std::string> toolPath =
-          llvm::sys::Process::FindInEnvPath("PATH", tool))
-    return *toolPath;
-
-  getOperation().emitError()
-      << "Couldn't find the `" << tool
-      << "` binary. Please specify the toolkit "
-         "path via GpuModuleToBinaryPass or add the compiler to $PATH`.";
-  return std::nullopt;
-}
-
-// There is 1 way to finalize SPIR-V to native code: IGC
+// There is 1 way to finalize IL to native code: IGC
 // There are 2 ways to access IGC: AOT (ocloc) and JIT (L0 runtime).
-// - L0 runtime consumes SPIR-V and is external to MLIR codebase (rt wrappers).
+// - L0 runtime consumes IL and is external to MLIR codebase (rt wrappers).
 // - `ocloc` tool can be "queried" from within MLIR.
 std::optional<SmallVector<char, 0>>
-SpirSerializer::compileToBinary(const std::string &asmStr,
-                                StringRef inputFormat) {
+SerializeGPUModuleBase::compileToBinary(const std::string &asmStr,
+                                        StringRef inputFormat) {
   using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
   // Find the `ocloc` tool.
   std::optional<std::string> oclocCompiler = findTool("ocloc");
   if (!oclocCompiler)
     return std::nullopt;
-  Location loc = getOperation().getLoc();
-  std::string basename =
-      llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(),
-                    getTarget().getTriple(), getTarget().getChip());
+  Location loc = getGPUModuleOp().getLoc();
+  std::string basename = llvm::formatv(
+      "mlir-{0}-{1}-{2}", getGPUModuleOp().getNameAttr().getValue(),
+      getTarget().getTriple(), getTarget().getChip());
 
   auto createTemp = [&](StringRef name,
                         StringRef suffix) -> std::optional<TmpFile> {
     llvm::SmallString<128> filePath;
     if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) {
-      getOperation().emitError()
+      getGPUModuleOp().emitError()
           << "Couldn't create the temp file: `" << filePath
           << "`, error message: " << ec.message();
       return std::nullopt;
@@ -315,7 +167,7 @@ SpirSerializer::compileToBinary(const std::string &asmStr,
 #define DEBUG_TYPE "serialize-to-binary"
   LLVM_DEBUG({
     llvm::dbgs() << "Tool invocation for module: "
-                 << getOperation().getNameAttr() << "\n";
+                 << getGPUModuleOp().getNameAttr() << "\n";
     llvm::interleave(oclocArgs, llvm::dbgs(), " ");
     llvm::dbgs() << "\n";
   });
@@ -359,6 +211,147 @@ SpirSerializer::compileToBinary(const std::string &asmStr,
   return SmallVector<char, 0>(bin.begin(), bin.end());
 }
 
+std::optional<std::string> SerializeGPUModuleBase::findTool(StringRef tool) {
+  // 1. Check the toolkit path given in the command line.
+  StringRef pathRef = targetOptions.getToolkitPath();
+  SmallVector<char, 256> path;
+  if (!pathRef.empty()) {
+    path.insert(path.begin(), pathRef.begin(), pathRef.end());
+    llvm::sys::path::append(path, "bin", tool);
+    if (llvm::sys::fs::can_execute(path))
+      return StringRef(path.data(), path.size()).str();
+  }
+  // 2. Check PATH.
+  if (std::optional<std::string> toolPath =
+          llvm::sys::Process::FindInEnvPath("PATH", tool))
+    return *toolPath;
+
+  getGPUModuleOp().emitError()
+      << "Couldn't find the `" << tool
+      << "` binary. Please specify the toolkit "
+         "path via GpuModuleToBinaryPass or add the compiler to $PATH`.";
+  return std::nullopt;
+}
+
+namespace {
+class SpirSerializer : public SerializeGPUModuleBase {
+public:
+  SpirSerializer(Operation &module, XeVMTargetAttr target,
+                 const gpu::TargetOptions &targetOptions)
+      : SerializeGPUModuleBase(module, target, targetOptions) {}
+
+  static void init();
+
+  /// Serializes the LLVM module to an object format, depending on the
+  /// compilation target selected in target options.
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
+};
+} // namespace
+
+void SpirSerializer::init() {
+  static llvm::once_flag initializeBackendOnce;
+  llvm::call_once(initializeBackendOnce, []() {
+#if LLVM_HAS_SPIRV_TARGET
+    LLVMInitializeSPIRVTarget();
+    LLVMInitializeSPIRVTargetInfo();
+    LLVMInitializeSPIRVTargetMC();
+    LLVMInitializeSPIRVAsmPrinter();
+#endif
+  });
+}
+
+std::optional<SmallVector<char, 0>>
+SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
+#define DEBUG_TYPE "serialize-to-llvm"
+  LLVM_DEBUG({
+    llvm::dbgs() << "LLVM IR for module: " << getGPUModuleOp().getNameAttr()
+                 << "\n";
+    llvm::dbgs() << llvmModule << "\n";
+    llvm::dbgs().flush();
+  });
+#undef DEBUG_TYPE
+
+  // Return LLVM IR if the compilation target is `offload`.
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
+    return SerializeGPUModuleBase::moduleToObject(llvmModule);
+
+#if !LLVM_HAS_SPIRV_TARGET
+  getGPUModuleOp()->emitError("The `SPIRV` target was not built. Please enable "
+                              "it when building LLVM.");
+  return std::nullopt;
+#endif // LLVM_HAS_SPIRV_TARGET
+
+  std::optional<llvm::TargetMachine *> targetMachine =
+      getOrCreateTargetMachine();
+  if (!targetMachine) {
+    getGPUModuleOp().emitError() << "Target Machine unavailable for triple "
+                                 << triple << ", can't optimize with LLVM\n";
+    return std::nullopt;
+  }
+
+  // Return SPIRV if the compilation target is `assembly`.
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    std::optional<std::string> serializedISA =
+        translateToISA(llvmModule, **targetMachine);
+    if (!serializedISA) {
+      getGPUModuleOp().emitError() << "Failed translating the module to ISA."
+                                   << triple << ", can't compile with LLVM\n";
+      return std::nullopt;
+    }
+
+#define DEBUG_TYPE "serialize-to-isa"
+    LLVM_DEBUG({
+      llvm::dbgs() << "SPIR-V for module: " << getGPUModuleOp().getNameAttr()
+                   << "\n";
+      llvm::dbgs() << *serializedISA << "\n";
+      llvm::dbgs().flush();
+    });
+#undef DEBUG_TYPE
+
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }
+
+  std::optional<std::string> serializedSPIRVBinary =
+      translateToSPIRVBinary(llvmModule, **targetMachine);
+  if (!serializedSPIRVBinary) {
+    getGPUModuleOp().emitError() << "Failed translating the module to Binary.";
+    return std::nullopt;
+  }
+  if (serializedSPIRVBinary->size() % 4) {
+    getGPUModuleOp().emitError() << "SPIRV code size must be a multiple of 4.";
+    return std::nullopt;
+  }
+  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+std::optional<std::string>
+SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  llvm::raw_string_ostream stream(targetISA);
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CodeGenFileType::ObjectFile))
+      return std::nullopt;
+
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
+}
+
 std::optional<SmallVector<char, 0>>
 XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
                                       const gpu::TargetOptions &options) const {
@@ -369,23 +362,29 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
     module->emitError("expected to be a gpu.module op");
     return std::nullopt;
   }
-  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
-    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
-      funcOp.setIntelReqdSubGroupSize(16);
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
+  auto target = cast<XeVMTargetAttr>(attribute);
+  if (target.getTriple().starts_with("spir")) {
+    gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+      if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+        funcOp.setIntelReqdSubGroupSize(16);
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
 
-  SpirSerializer serializer(*module, cast<XeVMTargetAttr>(attribute), options);
-  serializer.init();
+    SpirSerializer serializer(*module, cast<XeVMTargetAttr>(attribute),
+                              options);
+    serializer.init();
 
 #if !LLVM_HAS_SPIRV_TARGET
-  module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
-                    "without having the target built.");
+    module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
+                      "without having the target built.");
 #endif
 
-  return serializer.run();
+    return serializer.run();
+  }
+  module->emitError("Unsupported XeVM target triple: ") << target.getTriple();
+  return std::nullopt;
 }
 
 Attribute

>From b102c1b768d733ff165a6085d1025e4f5ad70bc1 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 7 Aug 2025 17:10:36 +0000
Subject: [PATCH 9/9] Cleanup CMake file.

---
 mlir/lib/Target/LLVM/CMakeLists.txt | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 11b69f530a674..d3a8f3c02599f 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -222,9 +222,6 @@ add_mlir_dialect_library(MLIRXeVMTarget
 
   OBJECT
 
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR
-
   LINK_COMPONENTS
   ${SPIRV_LIBS}
 
@@ -236,9 +233,3 @@ add_mlir_dialect_library(MLIRXeVMTarget
   MLIRTargetLLVM
   MLIRXeVMToLLVMIRTranslation
 )
-
-# Ensure SPIRV headers are included. Warning: references build directory!
-target_include_directories(MLIRXeVMTarget PRIVATE
-  ${LLVM_MAIN_SRC_DIR}/lib/Target/SPIRV
-  ${LLVM_BINARY_DIR}/lib/Target/SPIRV
-)