[Mlir-commits] [mlir] Add XeVM target and XeVM dialect integration tests. (PR #148286)
Sang Ik Lee
llvmlistbot at llvm.org
Fri Jul 11 13:16:58 PDT 2025
https://github.com/silee2 created https://github.com/llvm/llvm-project/pull/148286
Covers the remaining parts required for XeVM dialect integration testing. It has two high-level components:
- XeVM target and serialization support
- XeVM dialect integration tests using SYCL runtime
>From e6bcef6f83d6f079c8dab5ad9b3a355a5fb445c4 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 11 Jul 2025 20:13:51 +0000
Subject: [PATCH] Add XeVM target and XeVM dialect integration tests. Covers
remaining parts required for XeVM dialect integration testing. It has two high
level components - XeVM target and serialization support - XeVM dialect
integration tests using SYCL runtime
---
mlir/CMakeLists.txt | 8 +
mlir/include/mlir/InitAllDialects.h | 2 +
mlir/include/mlir/Target/LLVM/XeVM/Target.h | 31 +++
mlir/include/mlir/Target/LLVM/XeVM/Utils.h | 39 +++
mlir/include/mlir/Target/LLVMIR/Dialect/All.h | 3 +
.../Dialect/XeVM/XeVMToLLVMIRTranslation.h | 33 +++
.../GPU/Transforms/XeVMAttachTarget.cpp | 1 +
mlir/lib/Target/LLVM/CMakeLists.txt | 32 +++
mlir/lib/Target/LLVM/XeVM/Target.cpp | 257 ++++++++++++++++++
mlir/lib/Target/LLVMIR/CMakeLists.txt | 1 +
mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt | 1 +
.../Target/LLVMIR/Dialect/XeVM/CMakeLists.txt | 21 ++
.../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp | 117 ++++++++
.../Dialect/XeVM/GPU/lit.local.cfg | 4 +
.../Dialect/XeVM/GPU/xevm_block_dpas.mlir | 135 +++++++++
.../XeVM/GPU/xevm_block_load_store.mlir | 103 +++++++
.../xevm_block_load_store_pack_register.mlir | 119 ++++++++
.../GPU/xevm_block_load_store_transpose.mlir | 127 +++++++++
.../Dialect/XeVM/GPU/xevm_store_cst.mlir | 74 +++++
mlir/test/lib/Dialect/GPU/CMakeLists.txt | 1 +
mlir/test/lit.site.cfg.py.in | 1 +
21 files changed, 1110 insertions(+)
create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Target.h
create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Utils.h
create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
create mode 100644 mlir/lib/Target/LLVM/XeVM/Target.cpp
create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a1ad81f625cd6..7c9d62051f9f8 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -137,6 +137,14 @@ else()
set(MLIR_ENABLE_ROCM_CONVERSIONS 0)
endif()
+# Build the XeVM conversions and run the corresponding tests only if the SPIRV
+# backend is part of LLVM_TARGETS_TO_BUILD.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+ set(MLIR_ENABLE_XEVM_CONVERSIONS 1)
+else()
+ set(MLIR_ENABLE_XEVM_CONVERSIONS 0)
+endif()
+
set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index c6fcf1a0d510b..79dcafe69f0a5 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -102,6 +102,7 @@
#include "mlir/Interfaces/CastInterfaces.h"
#include "mlir/Target/LLVM/NVVM/Target.h"
#include "mlir/Target/LLVM/ROCDL/Target.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
#include "mlir/Target/SPIRV/Target.h"
namespace mlir {
@@ -200,6 +201,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
NVVM::registerNVVMTargetInterfaceExternalModels(registry);
ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
spirv::registerSPIRVTargetInterfaceExternalModels(registry);
+ xevm::registerXeVMTargetInterfaceExternalModels(registry);
}
/// Append all the MLIR dialects to the registry contained in the given context.
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Target.h b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
new file mode 100644
index 0000000000000..31a93d0ebabfc
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
@@ -0,0 +1,31 @@
+//===-- Target.h - MLIR XeVM target registration ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for attaching the XeVM target interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_XEVM_TARGET_H
+#define MLIR_TARGET_XEVM_TARGET_H
+
+namespace mlir {
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir::xevm {
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the given registry.
+void registerXeVMTargetInterfaceExternalModels(mlir::DialectRegistry &registry);
+
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the registry associated with the given context.
+void registerXeVMTargetInterfaceExternalModels(mlir::MLIRContext &context);
+} // namespace mlir::xevm
+
+#endif // MLIR_TARGET_XEVM_TARGET_H
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
new file mode 100644
index 0000000000000..c11a97f0d960a
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -0,0 +1,39 @@
+//===-- Utils.h - MLIR XeVM target utils ------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XeVM target related utility classes and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_XEVM_UTILS_H
+#define MLIR_TARGET_LLVM_XEVM_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/Target/LLVM/ModuleToObject.h"
+
+namespace mlir {
+namespace xevm {
+
+/// Base class for all XeVM serializations from GPU modules into binary strings.
+/// By default this class serializes into LLVM bitcode.
+class SerializeGPUModuleBase : public mlir::LLVM::ModuleToObject {
+public:
+ /// Creates a serializer for `module` using the XeVM target attribute
+ /// `target`. `targetOptions` may be omitted, in which case a
+ /// default-constructed value is used.
+ SerializeGPUModuleBase(mlir::Operation &module, XeVMTargetAttr target,
+ const mlir::gpu::TargetOptions &targetOptions = {});
+
+ /// Initializes the LLVM SPIRV backend components when that backend was
+ /// built; the initialization itself runs only once.
+ static void init();
+ /// Returns the XeVM target attribute held by this serializer.
+ XeVMTargetAttr getTarget() const;
+
+protected:
+ /// XeVM target attribute this serializer was constructed with.
+ XeVMTargetAttr target;
+};
+} // namespace xevm
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_XEVM_UTILS_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 60615cf601655..e4670cb1a9622 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -28,6 +28,7 @@
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
namespace mlir {
class DialectRegistry;
@@ -47,6 +48,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
registerROCDLDialectTranslation(registry);
registerSPIRVDialectTranslation(registry);
registerVCIXDialectTranslation(registry);
+ registerXeVMDialectTranslation(registry);
// Extension required for translating GPU offloading Ops.
gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
@@ -63,6 +65,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
registerNVVMDialectTranslation(registry);
registerROCDLDialectTranslation(registry);
registerSPIRVDialectTranslation(registry);
+ registerXeVMDialectTranslation(registry);
// Extension required for translating GPU offloading Ops.
gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
new file mode 100644
index 0000000000000..149a2119657d5
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
@@ -0,0 +1,33 @@
+//===-- XeVMToLLVMIRTranslation.h - XeVM to LLVM IR -------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for XeVM dialect to LLVM IR translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir {
+/// Register the XeVM dialect and the translation from it to the LLVM IR in the
+/// given registry.
+void registerXeVMDialectTranslation(mlir::DialectRegistry &registry);
+
+/// Register the XeVM dialect and the translation from it in the registry
+/// associated with the given context.
+void registerXeVMDialectTranslation(mlir::MLIRContext &context);
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
diff --git a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
index e9cf4939a13b8..6da76e9e7a331 100644
--- a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
@@ -17,6 +17,7 @@
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
#include "llvm/Support/Regex.h"
namespace mlir {
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 83fbf7a5fe5f3..ed15c5d2ab2ca 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -209,3 +209,35 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
)
endif()
+# Collect the LLVM SPIRV code generator component only when the SPIRV backend
+# is part of the LLVM build; otherwise SPIRV_LIBS stays unset.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+ set(SPIRV_LIBS
+ SPIRVCodeGen
+
+ )
+endif()
+
+# Library providing the `#xevm.target` serialization implemented in
+# XeVM/Target.cpp. SPIRV_LIBS is passed as LLVM link components.
+add_mlir_dialect_library(MLIRXeVMTarget
+ XeVM/Target.cpp
+
+ OBJECT
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR
+
+ LINK_COMPONENTS
+ ${SPIRV_LIBS}
+
+ LINK_LIBS PUBLIC
+ MLIRIR
+ MLIRExecutionEngineUtils
+ MLIRSupport
+ MLIRGPUDialect
+ MLIRTargetLLVM
+ MLIRXeVMToLLVMIRTranslation
+)
+
+# Ensure SPIRV headers are included. Warning: references build directory!
+# NOTE(review): the SPIRV backend headers are currently commented out in
+# XeVM/Target.cpp, so these private include paths may be unnecessary -- confirm.
+target_include_directories(MLIRXeVMTarget PRIVATE
+ ${LLVM_MAIN_SRC_DIR}/lib/Target/SPIRV
+ ${LLVM_BINARY_DIR}/lib/Target/SPIRV
+)
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
new file mode 100644
index 0000000000000..380e2bff222ca
--- /dev/null
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -0,0 +1,257 @@
+//===-- Target.cpp - MLIR LLVM XeVM target compilation ----------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines XeVM target related functions including registration
+// calls for the `#xevm.target` compilation attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/XeVM/Target.h"
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/Target/LLVM/XeVM/Utils.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+
+#include "llvm/Config/Targets.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+// FIXME: One of the headers uses `.inc` file from the build directory, this
+// does not work for installation (i.e., DCMAKE_INSTALL_PREFIX) caching as build
+// directory will not be cached. Since float atomics are not yet supported by
+// the backend anyway, we can afford to temporarily comment this section.
+
+// #if LLVM_HAS_SPIRV_TARGET
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+// #include "SPIRVTargetMachine.h"
+// #pragma GCC diagnostic pop
+
+// #include "SPIRVCommandLine.h"
+// #endif // LLVM_HAS_SPIRV_TARGET
+
+#include <set>
+
+using namespace mlir;
+
+namespace {
+// XeVM implementation of the gpu::TargetAttrInterface.
+class XeVMTargetAttrImpl
+ : public gpu::TargetAttrInterface::FallbackModel<XeVMTargetAttrImpl> {
+public:
+ // Serializes the GPU `module` for the XeVM target described by `attribute`.
+ // Returns std::nullopt when `module` is null or is not a `gpu.module`;
+ // otherwise returns the result of running the serializer.
+ std::optional<SmallVector<char, 0>>
+ serializeToObject(Attribute attribute, Operation *module,
+ const gpu::TargetOptions &options) const;
+
+ // Wraps the serialized `object` bytes in a gpu::ObjectAttr whose format is
+ // the compilation target stored in `options`.
+ Attribute createObject(Attribute attribute, Operation *module,
+ const SmallVector<char, 0> &object,
+ const gpu::TargetOptions &options) const;
+};
+} // namespace
+
+/// Adds an extension to `registry` that attaches the XeVM implementation of
+/// the GPU target attribute interface to `XeVMTargetAttr`. The extension
+/// callback receives the context and the XeVM dialect.
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(
+      +[](MLIRContext *ctx, mlir::xevm::XeVMDialect *dialect) {
+        mlir::xevm::XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
+      });
+}
+
+/// Context-based convenience overload: fills a temporary registry with the
+/// XeVM target interface extension and appends that registry to `context`.
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    MLIRContext &context) {
+  DialectRegistry targetModels;
+  registerXeVMTargetInterfaceExternalModels(targetModels);
+  context.appendDialectRegistry(targetModels);
+}
+
+// Forwards `module`, the target's triple and the target's `O` value to the
+// ModuleToObject base; the two base arguments in between are passed empty.
+// Also stores `target` in the member of the same name.
+// NOTE(review): `targetOptions` is accepted but not used by this constructor.
+mlir::xevm::SerializeGPUModuleBase::SerializeGPUModuleBase(
+ Operation &module, mlir::xevm::XeVMTargetAttr target,
+ const gpu::TargetOptions &targetOptions)
+ : ModuleToObject(module, target.getTriple(), "", {}, target.getO()),
+ target(target) {}
+
+// Initializes the LLVM SPIRV target, target info, target MC and asm printer.
+// The initialization is guarded by a function-local static once_flag, so the
+// callback runs only once. When LLVM_HAS_SPIRV_TARGET is not set the callback
+// is empty.
+void mlir::xevm::SerializeGPUModuleBase::init() {
+ static llvm::once_flag initializeBackendOnce;
+ llvm::call_once(initializeBackendOnce, []() {
+#if LLVM_HAS_SPIRV_TARGET
+ LLVMInitializeSPIRVTarget();
+ LLVMInitializeSPIRVTargetInfo();
+ LLVMInitializeSPIRVTargetMC();
+ LLVMInitializeSPIRVAsmPrinter();
+#endif
+ });
+}
+
+/// Accessor for the XeVM target attribute stored at construction time.
+mlir::xevm::XeVMTargetAttr
+mlir::xevm::SerializeGPUModuleBase::getTarget() const {
+  return this->target;
+}
+
+namespace {
+/// Serializer that turns a GPU module into LLVM bitcode, SPIR-V assembly or a
+/// SPIR-V binary, depending on the compilation target in its target options.
+class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase {
+public:
+  /// Creates a serializer for `module`. A copy of `targetOptions` is kept so
+  /// that `moduleToObject` can honor the requested compilation target; the
+  /// member used to be left default-constructed, which silently ignored the
+  /// caller's options.
+  SpirSerializer(Operation &module, mlir::xevm::XeVMTargetAttr target,
+                 const gpu::TargetOptions &targetOptions)
+      : mlir::xevm::SerializeGPUModuleBase(module, target, targetOptions),
+        targetOptions(targetOptions) {}
+
+  /// Returns the operation being serialized as a `gpu.module`.
+  gpu::GPUModuleOp getOperation();
+
+  /// Serializes `llvmModule` according to the stored target options.
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  /// Emits `llvmModule` as an object file through `targetMachine` and returns
+  /// the bytes, or std::nullopt when emission is not possible.
+  std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
+
+  /// Target options this serializer was created with.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+/// Returns the serialized operation viewed as a `gpu.module`; the result of
+/// the `dyn_cast` is null when the operation is of a different kind.
+gpu::GPUModuleOp SpirSerializer::getOperation() {
+  Operation &serialized = mlir::xevm::SerializeGPUModuleBase::getOperation();
+  return dyn_cast<gpu::GPUModuleOp>(&serialized);
+}
+
+// Serializes `llvmModule` according to the compilation target stored in
+// `targetOptions`:
+//  - `Offload`: delegates to the base class `moduleToObject`.
+//  - `Assembly`: returns the text produced by `translateToISA`, including a
+//    trailing null terminator.
+//  - otherwise: returns the bytes produced by `translateToSPIRVBinary`, whose
+//    size must be a multiple of 4.
+// Returns std::nullopt (after emitting an error on the operation) when the
+// SPIRV target was not built, no target machine is available, or a
+// translation step fails.
+std::optional<SmallVector<char, 0>>
+SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
+ // Return LLVM IR if the compilation target is `offload`.
+ if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
+ return mlir::xevm::SerializeGPUModuleBase::moduleToObject(llvmModule);
+
+ // Everything below needs the SPIRV backend; bail out early without it.
+#if !LLVM_HAS_SPIRV_TARGET
+ getOperation()->emitError(
+ "The `SPIRV` target was not built. Please enable it when building LLVM.");
+ return std::nullopt;
+#endif // !LLVM_HAS_SPIRV_TARGET
+
+ std::optional<llvm::TargetMachine *> targetMachine =
+ getOrCreateTargetMachine();
+ if (!targetMachine) {
+ getOperation().emitError() << "Target Machine unavailable for triple "
+ << triple << ", can't compile with LLVM\n";
+ return std::nullopt;
+ }
+
+ //===----------------------------------------------------------------------===//
+ // Workaround to enable spirv extensions that are not added to target machine
+ // by default.
+
+ // FIXME: see fixme comment above SPIRV headers.
+ // #if LLVM_HAS_SPIRV_TARGET
+ // std::set<llvm::SPIRV::Extension::Extension> AllowedExtIds{
+ // llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add,
+ // llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add};
+ // llvm::SPIRVTargetMachine *STM =
+ // static_cast<llvm::SPIRVTargetMachine *>(targetMachine.value());
+ // const_cast<llvm::SPIRVSubtarget *>(STM->getSubtargetImpl())
+ // ->initAvailableExtensions(AllowedExtIds);
+ // #endif // LLVM_HAS_SPIRV_TARGET
+
+ //===----------------------------------------------------------------------===//
+
+ // Return textual SPIRV if the compilation target is `assembly`.
+ if (targetOptions.getCompilationTarget() ==
+ gpu::CompilationTarget::Assembly) {
+ std::optional<std::string> serializedISA =
+ translateToISA(llvmModule, **targetMachine);
+ if (!serializedISA) {
+ getOperation().emitError() << "Failed translating the module to ISA.";
+ return std::nullopt;
+ }
+ // Make sure to include the null terminator.
+ StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+ return SmallVector<char, 0>(bin.begin(), bin.end());
+ }
+
+ // Any other compilation target gets the SPIRV binary.
+ std::optional<std::string> serializedSPIRVBinary =
+ translateToSPIRVBinary(llvmModule, **targetMachine);
+ if (!serializedSPIRVBinary) {
+ getOperation().emitError() << "Failed translating the module to Binary.";
+ return std::nullopt;
+ }
+ // Reject a binary whose byte count is not a multiple of 4.
+ if (serializedSPIRVBinary->size() % 4) {
+ getOperation().emitError() << "SPIRV code size must be a multiple of 4.";
+ return std::nullopt;
+ }
+ // The binary is copied without an extra terminator.
+ StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
+ return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+/// Runs the target machine's object-file emission pipeline over `llvmModule`
+/// and returns the emitted bytes as a string. Returns std::nullopt when
+/// `addPassesToEmitFile` reports that it could not set up the pipeline.
+std::optional<std::string>
+SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  std::string spirvBytes;
+  llvm::raw_string_ostream byteStream(spirvBytes);
+
+  {
+    // Keep the buffering stream in its own scope so that it is destroyed
+    // before `spirvBytes` is returned, which prevents the output from being
+    // stuck in the buffer.
+    llvm::buffer_ostream bufferedStream(byteStream);
+    llvm::legacy::PassManager emitPasses;
+
+    const bool setupFailed = targetMachine.addPassesToEmitFile(
+        emitPasses, bufferedStream, nullptr,
+        llvm::CodeGenFileType::ObjectFile);
+    if (setupFailed)
+      return std::nullopt;
+
+    emitPasses.run(llvmModule);
+  }
+  return spirvBytes;
+}
+
+/// Serializes a `gpu.module` for the XeVM target described by `attribute`.
+///
+/// Returns std::nullopt when `module` is null, when it is not a `gpu.module`
+/// (an error is emitted), or when LLVM was built without the SPIRV target (an
+/// error is emitted). Otherwise returns the result of running the serializer.
+std::optional<SmallVector<char, 0>>
+XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
+                                      const gpu::TargetOptions &options) const {
+  if (!module)
+    return std::nullopt;
+  auto gpuMod = dyn_cast<gpu::GPUModuleOp>(module);
+  if (!gpuMod) {
+    module->emitError("expected to be a gpu.module op");
+    return std::nullopt;
+  }
+  // Set a required sub-group size of 16 on the first function carrying the GPU
+  // kernel attribute. The walk is interrupted right after that function, so
+  // later kernels are left untouched.
+  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+      funcOp.setIntelReqdSubGroupSize(16);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  SpirSerializer serializer(
+      *module, cast<mlir::xevm::XeVMTargetAttr>(attribute), options);
+  serializer.init();
+
+#if !LLVM_HAS_SPIRV_TARGET
+  // Without the SPIRV backend no target can be looked up. Report the problem
+  // and stop here instead of running the serializer after emitting an error.
+  module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
+                    "without having the target built.");
+  return std::nullopt;
+#endif
+
+  return serializer.run();
+}
+
+/// Builds the `gpu::ObjectAttr` for a serialized module: the target is
+/// `attribute`, the format is the compilation target from `options`, the
+/// payload is the raw bytes of `object`, the properties dictionary is null and
+/// no kernel table is attached.
+Attribute
+XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  Builder attrBuilder(attribute.getContext());
+  DictionaryAttr noProperties;
+  StringRef rawBytes(object.data(), object.size());
+  return attrBuilder.getAttr<gpu::ObjectAttr>(
+      attribute, format, attrBuilder.getStringAttr(rawBytes), noProperties,
+      /*kernels=*/nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index af22a7ff04bf0..9ea5c6835e8ef 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -60,6 +60,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
MLIRROCDLToLLVMIRTranslation
MLIRSPIRVToLLVMIRTranslation
MLIRVCIXToLLVMIRTranslation
+ MLIRXeVMToLLVMIRTranslation
)
add_mlir_translation_library(MLIRTargetLLVMIRImport
diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
index f030fa78942d5..86c731a1074c3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(OpenMP)
add_subdirectory(ROCDL)
add_subdirectory(SPIRV)
add_subdirectory(VCIX)
+add_subdirectory(XeVM)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
new file mode 100644
index 0000000000000..6308d7e2e4404
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_OPTIONAL_SOURCES
+ XeVMToLLVMIRTranslation.cpp
+)
+
+add_mlir_translation_library(MLIRXeVMToLLVMIRTranslation
+ XeVMToLLVMIRTranslation.cpp
+
+ DEPENDS
+ MLIRXeVMConversionsIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIRDialectUtils
+ MLIRIR
+ MLIRLLVMDialect
+ MLIRXeVMDialect
+ MLIRSupport
+ MLIRTargetLLVMIRExport
+)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
new file mode 100644
index 0000000000000..f961d25ff1b86
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
@@ -0,0 +1,117 @@
+//===-- XeVMToLLVMIRTranslation.cpp - Translate XeVM to LLVM IR -*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between the MLIR XeVM dialect and
+// LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+namespace {
+/// Implementation of the dialect interface that converts operations belonging
+/// to the XeVM dialect to LLVM IR.
+class XeVMDialectLLVMIRTranslationInterface
+    : public LLVMTranslationDialectInterface {
+public:
+  using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+  /// Translates the given operation to LLVM IR using the provided IR builder
+  /// and saving the state in `moduleTranslation`.
+  /// Not implemented yet: always returns failure.
+  LogicalResult
+  convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation) const final {
+    /* TODO */
+    return failure();
+  }
+
+  /// Handles XeVM dialect attributes attached to already translated
+  /// operations.
+  ///
+  /// For the cache-controls attribute, the value must be an array of exactly
+  /// two entries and `op` must have been translated to exactly one LLVM
+  /// instruction; the entries are then attached to that instruction as
+  /// `spirv.DecorationCacheControlINTEL` metadata. For any other attribute the
+  /// result is success only when `op` is an `LLVM::LLVMFuncOp`.
+  LogicalResult
+  amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
+                 NamedAttribute attribute,
+                 LLVM::ModuleTranslation &moduleTranslation) const final {
+    StringRef attrName = attribute.getName().getValue();
+    if (attrName == mlir::xevm::XeVMDialect::getCacheControlsAttrName()) {
+      auto cacheControlsArray = dyn_cast<ArrayAttr>(attribute.getValue());
+      // `dyn_cast` yields a null attribute for non-array values; check it
+      // before querying the size.
+      if (!cacheControlsArray || cacheControlsArray.size() != 2) {
+        return op->emitOpError(
+            "Expected both L1 and L3 cache control attributes!");
+      }
+      if (instructions.size() != 1) {
+        return op->emitOpError("Expecting a single instruction");
+      }
+      if (failed(handleDecorationCacheControl(instructions.front(),
+                                              cacheControlsArray.getValue()))) {
+        return op->emitOpError("Expected each cache control attribute to be "
+                               "an array of 4 integer attributes");
+      }
+      return success();
+    }
+    return success(isa<LLVM::LLVMFuncOp>(op));
+  }
+
+private:
+  /// Number of integer operands in a single cache control decoration.
+  static constexpr unsigned kCacheControlOperandCount = 4;
+
+  /// Wraps an integer constant of the given LLVM `type` as metadata.
+  template <typename IntTy>
+  static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) {
+    return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val));
+  }
+
+  /// Attaches `spirv.DecorationCacheControlINTEL` metadata to `inst`, with one
+  /// metadata node per entry of `attrs`.
+  ///
+  /// Each entry must be an array of exactly `kCacheControlOperandCount`
+  /// integer attributes. Failure is returned, and `inst` is left unchanged,
+  /// when an entry has a different shape. This replaces unchecked casts and a
+  /// fixed-size buffer that could be overrun or left partly uninitialized.
+  static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst,
+                                                    ArrayRef<Attribute> attrs) {
+    SmallVector<llvm::Metadata *> decorations;
+    llvm::LLVMContext &ctx = inst->getContext();
+    llvm::Type *i32Ty = llvm::IntegerType::getInt32Ty(ctx);
+    for (Attribute attr : attrs) {
+      auto valuesArray = dyn_cast<ArrayAttr>(attr);
+      if (!valuesArray || valuesArray.size() != kCacheControlOperandCount)
+        return failure();
+      SmallVector<llvm::Metadata *, kCacheControlOperandCount> metadata;
+      for (Attribute valueAttr : valuesArray.getValue()) {
+        auto intAttr = dyn_cast<IntegerAttr>(valueAttr);
+        if (!intAttr)
+          return failure();
+        metadata.push_back(getConstantIntMD(i32Ty, intAttr.getValue()));
+      }
+      decorations.push_back(llvm::MDNode::get(ctx, metadata));
+    }
+    constexpr llvm::StringLiteral decorationCacheControlMDName =
+        "spirv.DecorationCacheControlINTEL";
+    inst->setMetadata(decorationCacheControlMDName,
+                      llvm::MDNode::get(ctx, decorations));
+    return success();
+  }
+};
+} // namespace
+
+/// Inserts the XeVM dialect into `registry` and adds an extension that
+/// installs the XeVM-to-LLVM-IR translation interface on the dialect.
+void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
+  registry.insert<xevm::XeVMDialect>();
+  registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) {
+    dialect->addInterfaces<XeVMDialectLLVMIRTranslationInterface>();
+  });
+}
+
+/// Context-based convenience overload: registers the XeVM translation in a
+/// temporary registry and appends that registry to `context`.
+void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
+  DialectRegistry translationRegistry;
+  registerXeVMDialectTranslation(translationRegistry);
+  context.appendDialectRegistry(translationRegistry);
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
new file mode 100644
index 0000000000000..d172445e6ee54
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
@@ -0,0 +1,4 @@
+# Mark every test in this directory as unsupported unless both the XeVM tests
+# and the SYCL runner are enabled in the lit configuration.
+if not config.run_xevm_tests:
+ config.unsupported = True
+if not config.enable_sycl_runner:
+ config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
new file mode 100644
index 0000000000000..07bd15a35083e
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
@@ -0,0 +1,135 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // - Sets of `matrix_mad` intrinsics can differ based on device's *minimal* supported sub-group size.
+    //   The *minimum supported* sub-group size should be used to call `matrix_mad` intrinsics.
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
+
+    // Loads one tile each of A (16-bit elements, 8 rows), B (16-bit elements,
+    // 16 rows) and C (32-bit elements, 8 rows) with 2D block loads, feeds them
+    // to `xevm.mma` (m=8, n=16, k=16; f16 inputs, f32 accumulator/result) and
+    // writes the result back over the C tile.
+    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel {
+      %base_width_a = arith.constant 32 : i32
+      %base_height_a = arith.constant 8 : i32
+      %base_pitch_a = arith.constant 32 : i32
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>
+
+      %base_width_b = arith.constant 32 : i32
+      %base_height_b = arith.constant 16 : i32
+      %base_pitch_b = arith.constant 32 : i32
+      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      // `xevm.mma` below takes the B operand as vector<8xi32>, so the sixteen
+      // 16-bit values are reinterpreted pairwise as 32-bit values.
+      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32>
+
+      %base_width_c = arith.constant 64 : i32
+      %base_height_c = arith.constant 8 : i32
+      %base_pitch_c = arith.constant 64 : i32
+      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+
+      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32>
+      %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted {shape=<m=8, n=16, k=16>, types=<d=f32, a=f16, b=f16, c=f32>} : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
+      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32>
+
+      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  // Host-side driver: copies %a, %b and %c into `gpu.alloc` buffers, passes
+  // their aligned pointers (cast to address space 1) to @block_dpas launched
+  // as one block of 16 threads, and returns a freshly allocated copy of the
+  // C buffer. The caller is responsible for deallocating the returned memref.
+  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+
+    %memref_a = gpu.alloc() : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
+    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
+    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
+    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
+    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_b = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
+    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
+    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
+    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
+    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_c = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>
+    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
+    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
+    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
+    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    %res = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %res : memref<8x16xf32>
+  }
+
+  // Builds the inputs, runs @test and prints the result.
+  // A[i][j] = i, B[i][j] = j and C starts at zero, so the expected result is
+  // C[i][j] = sum over the 16 values of k of (i * j) = 16 * i * j.
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %row_idx = arith.index_cast %i : index to i32
+        %row = arith.sitofp %row_idx : i32 to f16
+        memref.store %row, %A[%i, %j] : memref<8x16xf16>
+      }
+    }
+    %B = memref.alloc() : memref<16x16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %col_idx = arith.index_cast %j : index to i32
+        %col = arith.sitofp %col_idx : i32 to f16
+        memref.store %col, %B[%i, %j] : memref<16x16xf16>
+      }
+    }
+
+    %C = memref.alloc() : memref<8x16xf32>
+    %c0_f32 = arith.constant 0.0 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c0_f32, %C[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
+    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    // CHECK-NEXT: [0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]
+    // CHECK-NEXT: [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480]
+    // CHECK-NEXT: [0, 48, 96, 144, 192, 240, 288, 336, 384, 432, 480, 528, 576, 624, 672, 720]
+    // CHECK-NEXT: [0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960]
+    // CHECK-NEXT: [0, 80, 160, 240, 320, 400, 480, 560, 640, 720, 800, 880, 960, 1040, 1120, 1200]
+    // CHECK-NEXT: [0, 96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056, 1152, 1248, 1344, 1440]
+    // CHECK-NEXT: [0, 112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120, 1232, 1344, 1456, 1568, 1680]
+
+    memref.dealloc %A : memref<8x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    memref.dealloc %C : memref<8x16xf32>
+    memref.dealloc %C_res : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
new file mode 100644
index 0000000000000..3efb43bd0e426
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
@@ -0,0 +1,103 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+ gpu.module @kernel {
+ // - `cl_intel_subgroups` block load/store intrinsics operate at the *maximum* sub-group size,
+ // regardless of the active sub-group size. Make sure `clGetKernelSubGroupInfo` meets your expectations.
+ // - The attribute `intel_reqd_sub_group_size` establishes the maximum sub-group size for a kernel.
+ //
+ // Note: launching 16 threads without explicit `intel_reqd_sub_group_size = 16` may still use
+ // the default sub-group size of 32.
+ //
+ // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_required_subgroup_size.html
+ // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html
+
+ // Round-trips one tile of 32-bit elements (tile_width=16, tile_height=8)
+ // from %src to %dst. Before the store, each thread overwrites element 0 of
+ // its loaded vector with its own x thread id (converted to f32), which makes
+ // the per-thread placement visible in the printed output.
+ gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
+ %base_width = arith.constant 64 : i32 // bytewidth of the block
+ %base_height = arith.constant 8 : i32 // number of rows
+ %base_pitch = arith.constant 64 : i32 // bytewidth of the base row
+ %x = arith.constant 0 : i32
+ %y = arith.constant 0 : i32
+ // If `intel_reqd_sub_group_size = 16` is not set, the default (32) is used and this `blockload2d` would only load 4 elements into vector<8xi32>
+ %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+ %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+ %c0 = arith.constant 0 : i32
+ %thread_x = gpu.thread_id x
+ %thread_x_i64 = arith.index_cast %thread_x : index to i64
+ %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+ %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+ %loaded_f32_modified = vector.insertelement %thread_x_f32, %loaded_f32[%c0 : i32] : vector<8xf32>
+ %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
+ xevm.blockstore2d %dst, %base_width, %base_height, %base_pitch, %x, %y, %loaded_modified <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+ gpu.return
+ }
+ }
+
+ // Host-side driver: copies %src into a `gpu.alloc` buffer, allocates a
+ // second `gpu.alloc` buffer for the output, launches @block_load_store as
+ // one block of 16 threads on the two aligned pointers (cast to address
+ // space 1), and returns a freshly allocated copy of the output buffer.
+ func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+ %c1 = arith.constant 1 : index
+ %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+ %memref_src = gpu.alloc() : memref<8x16xf32>
+ gpu.memcpy %memref_src, %src : memref<8x16xf32>, memref<8x16xf32>
+ %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<8x16xf32> -> index
+ %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+ %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+ %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+ %memref_dst = gpu.alloc() : memref<8x16xf32>
+ %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+ %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+ %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+ %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+ gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+ gpu.dealloc %memref_src : memref<8x16xf32>
+ %dst = memref.alloc() : memref<8x16xf32>
+ gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+ gpu.dealloc %memref_dst : memref<8x16xf32>
+ return %dst : memref<8x16xf32>
+ }
+
+ // Fills an 8x16 buffer with 11.11, runs @test on it, then prints the input
+ // followed by the output.
+ func.func @main() attributes {llvm.emit_c_interface} {
+ %A = memref.alloc() : memref<8x16xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c11_f32 = arith.constant 11.11 : f32
+ scf.for %i = %c0 to %c8 step %c1 {
+ scf.for %j = %c0 to %c16 step %c1 {
+ memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+ }
+ }
+ %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+ %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+ %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+ call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+ call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+
+ // Input: all eight rows hold 11.11 (first row, 6 x 16 = 96 values, last row).
+ // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+ // CHECK-NEXT: [11.11{{.*}}]
+ // CHECK-COUNT-96: 11.11
+ // CHECK-NEXT: [11.11{{.*}}]
+
+ // Output: the thread ids 0..15 come first, the remaining rows are unchanged.
+ // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+ // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // CHECK-COUNT-96: 11.11
+ // CHECK-NEXT: [11.11{{.*}}]
+
+ memref.dealloc %A : memref<8x16xf32>
+ memref.dealloc %B : memref<8x16xf32>
+ return
+ }
+ func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
new file mode 100644
index 0000000000000..88d2e90794fb0
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // Loads the same tile of 16-bit elements (tile_width=16, tile_height=16)
+    // twice, once with pack_register=false and once with pack_register=true,
+    // writes the x thread id into position [0,0,1] of the packed result, and
+    // stores the data to %dst as two 8-row halves.
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height_load = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+
+      // Consider the following two loads:
+      // Normal load:
+      %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16>
+      %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+
+      // Register packed load:
+      %loaded_packed = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_packed_f16_flat = vector.bitcast %loaded_packed : vector<8xi32> to vector<16xf16>
+      %loaded_packed_f16 = vector.shape_cast %loaded_packed_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+      // Both can be represented the same way in code as vector<16xf16>.
+      // A normal load pads a value to a dword (e.g., 32-bit) when loaded to a register.
+      // Packed load "packs" multiple sub-dword values along the column (↓), allowing a single register to hold multiple values.
+      // In SIMT, a work-item reads values along the column (↓), hence a sequence of values loaded by packing to register is logically equivalent to the sequence of values loaded using a normal load.
+      // The load results of both methods can have the same logical representation, but are expected to differ in physical layout and register efficiency.
+
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16
+      %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_packed_f16 [0,0,1] : f16 into vector<8x1x2xf16> // Both loaded_packed_f16 and loaded_f16 can be used here
+      // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores
+      %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16>
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 32 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+
+      %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16>
+
+      // The second store starts pitch * rows = 32 * 8 bytes past %dst (the GEP
+      // below indexes i8), i.e. right after the 8 rows written by the first one.
+      %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32
+      %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8
+      xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+      gpu.return
+    }
+  }
+
+
+  // Host-side driver: copies %src into a `gpu.alloc` buffer, allocates a
+  // second `gpu.alloc` buffer for the output, launches @block_load_store as
+  // one block of 16 threads on the two aligned pointers (cast to address
+  // space 1), and returns a freshly allocated copy of the output buffer.
+  func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_src, %src : memref<16x16xf16>, memref<16x16xf16>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<16x16xf16>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x16xf16>
+    %dst = memref.alloc() : memref<16x16xf16>
+    gpu.memcpy %dst, %memref_dst : memref<16x16xf16>, memref<16x16xf16>
+    gpu.dealloc %memref_dst : memref<16x16xf16>
+    return %dst : memref<16x16xf16>
+  }
+
+  // Fills a 16x16 f16 buffer with 11.1, runs @test on it, then prints the
+  // input followed by the output.
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    // Both dimensions of %A are 16, so one bound serves both loops.
+    %c16 = arith.constant 16 : index
+    %c11_f16 = arith.constant 11.1 : f16
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f16, %A[%i, %j] : memref<16x16xf16>
+      }
+    }
+    %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16>
+    %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16>
+    %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16>
+    call @printMemrefF16(%A_cast) : (memref<*xf16>) -> ()
+    call @printMemrefF16(%B_cast) : (memref<*xf16>) -> ()
+
+    // Input: sixteen rows of 11.1 (first row, 14 x 16 = 224 values, last row).
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK-COUNT-224: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    // Output: the first row is untouched, the thread ids 0..15 follow it, and
+    // the remaining rows are unchanged.
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    // CHECK-COUNT-208: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    memref.dealloc %A : memref<16x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
new file mode 100644
index 0000000000000..646d1fc9b4666
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
@@ -0,0 +1,127 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // Loads a tile of 32-bit elements (tile_width=8, tile_height=16) from %src
+    // with transpose=true and stores the loaded vector, unmodified, to %dst
+    // as a tile with tile_width=16 and tile_height=8.
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      // Normally a work-item loads a vertical slice (↓), but with *transpose* a work-item loads a horizontal slice (→).
+      // The tile dimension we want to slice must be a multiple of the sub-group size: e.g., we want to slice rows (→), then we need SG_SIZE % tile_height == 0.
+      // NOTE(review): "a multiple of the sub-group size" reads as tile_height % SG_SIZE == 0, the reverse of the formula above.
+      // Both relations hold in this test (tile_height = 16, 16 launched threads) - confirm which one is intended.
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=true, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+
+      // Debugging aid, deliberately not stored: pass %loaded_f32_modified_1 to
+      // the store below instead of %loaded to see where thread ids end up.
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32> // Use this to see where threadIds end up stored
+      %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
+
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 64 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row
+      // "Transposed" stores are not available, so a work-item can only store its vector as a vertical slice (↓).
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+
+  // Host-side driver: copies the 16x8 %src into a `gpu.alloc` buffer,
+  // allocates an 8x16 `gpu.alloc` buffer for the output, launches
+  // @block_load_store as one block of 16 threads on the two aligned pointers
+  // (cast to address space 1), and returns a freshly allocated copy of the
+  // output buffer.
+  func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x8xf32>
+    gpu.memcpy %memref_src, %src : memref<16x8xf32>, memref<16x8xf32>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x8xf32>
+    %dst = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  // Builds a 16x8 input with A[i][j] = i + j / 10, so that every printed value
+  // shows its own row before the decimal point and its column after it, then
+  // runs @test and prints the input followed by the 8x16 output.
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x8xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c8 step %c1 {
+        %c_10_f = arith.constant 10.0 : f32
+        %j_i64 = arith.index_cast %j : index to i64
+        %j_i32 = llvm.trunc %j_i64 : i64 to i32
+        %j_f32 = arith.sitofp %j_i32 : i32 to f32
+        %jj = arith.divf %j_f32, %c_10_f : f32
+
+        %i_i64 = arith.index_cast %i : index to i64
+        %i_i32 = llvm.trunc %i_i64 : i64 to i32
+        %i_f32 = arith.sitofp %i_i32 : i32 to f32
+        %ii = arith.addf %i_f32, %jj : f32
+        memref.store %ii, %A[%i, %j] : memref<16x8xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32>
+    %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
+    // CHECK-NEXT: [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7],
+    // CHECK-NEXT: [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7],
+    // CHECK-NEXT: [3, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7],
+    // CHECK-NEXT: [4, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7],
+    // CHECK-NEXT: [5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7],
+    // CHECK-NEXT: [6, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7],
+    // CHECK-NEXT: [7, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7],
+    // CHECK-NEXT: [8, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7],
+    // CHECK-NEXT: [9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7],
+    // CHECK-NEXT: [10, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7],
+    // CHECK-NEXT: [11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7],
+    // CHECK-NEXT: [12, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7],
+    // CHECK-NEXT: [13, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7],
+    // CHECK-NEXT: [14, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7],
+    // CHECK-NEXT: [15, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7]
+
+    // The output is the transpose of the input: B[j][i] = A[i][j].
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    // CHECK-NEXT: [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 14.1, 15.1],
+    // CHECK-NEXT: [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 14.2, 15.2],
+    // CHECK-NEXT: [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3, 15.3],
+    // CHECK-NEXT: [0.4, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4, 9.4, 10.4, 11.4, 12.4, 13.4, 14.4, 15.4],
+    // CHECK-NEXT: [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5],
+    // CHECK-NEXT: [0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6, 9.6, 10.6, 11.6, 12.6, 13.6, 14.6, 15.6],
+    // CHECK-NEXT: [0.7, 1.7, 2.7, 3.7, 4.7, 5.7, 6.7, 7.7, 8.7, 9.7, 10.7, 11.7, 12.7, 13.7, 14.7, 15.7]
+
+    memref.dealloc %A : memref<16x8xf32>
+    memref.dealloc %B : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
new file mode 100644
index 0000000000000..7ead3577857f5
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
@@ -0,0 +1,74 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_sycl_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    // Each invocation stores the constant 42.0 at %ptr[lane id]. The address
+    // is computed with an i32-typed GEP, so the index counts i32-sized slots.
+    gpu.func @store_constant(%ptr: !llvm.ptr<1>) kernel {
+      %forty_two = arith.constant 42.0 : f32
+      %lane = gpu.lane_id
+      %lane_i64 = arith.index_cast %lane : index to i64
+      %slot = llvm.getelementptr %ptr[%lane_i64] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, i32
+      llvm.store %forty_two, %slot : f32, !llvm.ptr<1>
+      gpu.return
+    }
+  }
+  // Host-side driver: copies %src into a `gpu.alloc` buffer, launches
+  // @store_constant as one block of 16 threads on that buffer's aligned
+  // pointer (cast to address space 1), and returns a freshly allocated copy
+  // of the buffer contents.
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %dev_buf = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %dev_buf, %src : memref<8x16xf32>, memref<8x16xf32>
+    %dev_ptr_idx = memref.extract_aligned_pointer_as_index %dev_buf : memref<8x16xf32> -> index
+    %dev_ptr_i64 = arith.index_cast %dev_ptr_idx : index to i64
+    %dev_ptr = llvm.inttoptr %dev_ptr_i64 : i64 to !llvm.ptr
+    %dev_ptr_global = llvm.addrspacecast %dev_ptr : !llvm.ptr to !llvm.ptr<1>
+    gpu.launch_func @kernel::@store_constant blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%dev_ptr_global : !llvm.ptr<1>)
+    %host_out = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %host_out, %dev_buf : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %dev_buf : memref<8x16xf32>
+
+    return %host_out : memref<8x16xf32>
+  }
+
+  // Fills an 8x16 buffer with 11.11, runs @test on it, then prints the input
+  // followed by the output.
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %input = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %fill = arith.constant 11.11 : f32
+    scf.for %row = %c0 to %c8 step %c1 {
+      scf.for %col = %c0 to %c16 step %c1 {
+        memref.store %fill, %input[%row, %col] : memref<8x16xf32>
+      }
+    }
+    %output = call @test(%input) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %output_unranked = memref.cast %output : memref<8x16xf32> to memref<*xf32>
+    %input_unranked = memref.cast %input : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%input_unranked) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%output_unranked) : (memref<*xf32>) -> ()
+
+    // Input: all eight rows hold 11.11 (first row, 6 x 16 = 96 values, last row).
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // Output: the first row was overwritten with 42, the rest is unchanged.
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    memref.dealloc %input : memref<8x16xf32>
+    memref.dealloc %output : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
index 418c884dc03b3..882d5abc2eeb8 100644
--- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
@@ -30,6 +30,7 @@ set(LIBS
MLIRVectorDialect
MLIRVectorToLLVMPass
MLIRXeVMDialect
+ MLIRXeVMToLLVMIRTranslation
)
add_mlir_library(MLIRGPUTestPasses
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 132aabe135940..08dc98938a31a 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -32,6 +32,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.run_xevm_tests = @MLIR_ENABLE_XEVM_CONVERSIONS@
config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
More information about the Mlir-commits
mailing list