[Mlir-commits] [mlir] [MLIR] Add XeGPU dialect for Intel GPU (PR #78483)
Chao Chen
llvmlistbot at llvm.org
Wed Feb 21 13:12:38 PST 2024
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/78483
>From 35440b9b0751dec934049aed9257ae2bbcfabe13 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 17 Jan 2024 17:45:24 +0000
Subject: [PATCH 1/4] add XeGPU dialect definition
---
mlir/include/mlir/Dialect/CMakeLists.txt | 1 +
.../include/mlir/Dialect/XeGPU/CMakeLists.txt | 1 +
.../mlir/Dialect/XeGPU/IR/CMakeLists.txt | 14 +
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 52 +
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.td | 14 +
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 150 ++
.../mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 46 +
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 505 +++++
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 170 ++
mlir/include/mlir/InitAllDialects.h | 4 +-
mlir/lib/Dialect/CMakeLists.txt | 1 +
mlir/lib/Dialect/XeGPU/CMakeLists.txt | 1 +
mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt | 15 +
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 385 ++++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 1929 +++++++++++++++++
mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir | 110 +
mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir | 43 +
mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 38 +
mlir/test/Dialect/XeGPU/IR/barrier_ops.mlir | 54 +
.../Dialect/XeGPU/IR/create_nd_tdesc.mlir | 111 +
.../Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 115 +
mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir | 11 +
.../Dialect/XeGPU/IR/create_tdesc_vc.mlir | 51 +
mlir/test/Dialect/XeGPU/IR/invalid_vc.mlir | 70 +
.../test/Dialect/XeGPU/IR/load_gather_vc.mlir | 50 +
mlir/test/Dialect/XeGPU/IR/load_nd.mlir | 164 ++
mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir | 69 +
.../test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir | 62 +
mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir | 71 +
.../test/Dialect/XeGPU/IR/simple_gemm_vc.mlir | 65 +
mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir | 83 +
mlir/test/Dialect/XeGPU/IR/store_scatter.mlir | 29 +
.../Dialect/XeGPU/IR/store_scatter_vc.mlir | 29 +
.../Dialect/XeGPU/IR/update_nd_offset.mlir | 27 +
.../Dialect/XeGPU/IR/update_offset_vc.mlir | 29 +
35 files changed, 4568 insertions(+), 1 deletion(-)
create mode 100644 mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.td
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
create mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
create mode 100644 mlir/lib/Dialect/XeGPU/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
create mode 100644 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
create mode 100644 mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/barrier_ops.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/invalid_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/load_nd.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir
create mode 100644 mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir
diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt
index 1c4569ecfa5848..e0eb421291ded7 100644
--- a/mlir/include/mlir/Dialect/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/CMakeLists.txt
@@ -39,3 +39,4 @@ add_subdirectory(UB)
add_subdirectory(Utils)
add_subdirectory(Vector)
add_subdirectory(X86Vector)
+add_subdirectory(XeGPU)
diff --git a/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
new file mode 100644
index 00000000000000..f33061b2d87cff
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
new file mode 100644
index 00000000000000..f1740e9ed929a6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_mlir_dialect(XeGPU xegpu)
+add_mlir_doc(XeGPU XeGPU Dialects/ -gen-dialect-doc -dialect=xegpu)
+
+set(LLVM_TARGET_DEFINITIONS XeGPU.td)
+mlir_tablegen(XeGPUAttrs.h.inc -gen-attrdef-decls)
+mlir_tablegen(XeGPUAttrs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRXeGPUAttrsIncGen)
+add_dependencies(mlir-headers MLIRXeGPUAttrsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPU.td)
+mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
+mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRXeGPUEnumsIncGen)
+add_dependencies(mlir-headers MLIRXeGPUEnumsIncGen)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
new file mode 100644
index 00000000000000..a05e046a0e0c0b
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -0,0 +1,52 @@
+//===- XeGPU.h - MLIR dialect for XeGPU -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
+#define MLIR_DIALECT_XEGPU_IR_XEGPU_H
+
+#include <mlir/IR/BuiltinTypes.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/OpDefinition.h>
+#include <mlir/IR/Region.h>
+#include <mlir/IR/Types.h>
+#include <mlir/Interfaces/CastInterfaces.h>
+#include <mlir/Interfaces/ControlFlowInterfaces.h>
+#include <mlir/Interfaces/CopyOpInterface.h>
+#include <mlir/Interfaces/InferTypeOpInterface.h>
+#include <mlir/Interfaces/ShapedOpInterfaces.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include <mlir/Interfaces/ViewLikeInterface.h>
+
+namespace mlir {
+
+/// Return the list of Range (i.e. offset, size, stride). Each Range
+/// entry contains either the dynamic value or a ConstantIndexOp constructed
+/// with `b` at location `loc`.
+SmallVector<Range, 8> getOrCreateRanges(OffsetSizeAndStrideOpInterface op,
+ OpBuilder &b, Location loc);
+
+} // namespace mlir
+
+namespace mlir {
+namespace xegpu {
+
+class TensorDescType;
+
+} // namespace xegpu
+} // namespace mlir
+
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
+#define GET_ATTRDEF_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
+#define GET_TYPEDEF_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc>
+#define GET_OP_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPU.h.inc>
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPU_H
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.td
new file mode 100644
index 00000000000000..232e962870716c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.td
@@ -0,0 +1,14 @@
+//===- XeGPU.td - XeGPU dialect definition ------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPU_TD
+
+include "mlir/Dialect/XeGPU/IR/XeGPUOps.td"
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPU_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
new file mode 100644
index 00000000000000..ed3d9bbc772567
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -0,0 +1,150 @@
+//===- XeGPUAttrs.td - XeGPU dialect attributes definition --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
+
+include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
+
+class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
+ string baseCppClass = "::mlir::Attribute">
+ : AttrDef<XeGPU_Dialect, name, traits, baseCppClass> {
+ let mnemonic = attrMnemonic;
+}
+
+def XeGPU_ScatteredAttr : XeGPUAttr<"Scattered", "scattered"> {
+ let summary = "Scattered attribute for scattered read and write operation.";
+ let description = [{An attribute represent scattered read and write operation.
+ It does not (need to) have meaningful input values. The existence of itself
+ implies scattered read/write.}];
+
+ let assemblyFormat = "";
+}
+
+def XeGPU_SgMapAttr: XeGPUAttr<"SubGroupMap", "sg_map"> {
+ let parameters = (ins
+ "mlir::DenseI32ArrayAttr":$wi_layout,
+ "mlir::DenseI32ArrayAttr":$wi_data
+ );
+
+ // In format of #xegpu.sg_map<{mma_block_size = [2, 4], wi_layout = [2, 4], wi_data = [2, 4]}>
+ let assemblyFormat = "`<` struct(params) `>`";
+
+ let genVerifyDecl = true;
+
+ let builders = [
+ AttrBuilder<(ins
+ "llvm::ArrayRef<int32_t>":$wiLayout,
+ "llvm::ArrayRef<int32_t>":$wiData
+ )>
+ ];
+}
+
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+ let parameters = (ins
+ DefaultValuedParameter<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">: $memory_scope,
+ DefaultValuedParameter<"int", "1">: $array_length,
+ DefaultValuedParameter<"bool", "true">: $boundary_check,
+ OptionalParameter<"xegpu::ScatteredAttr">: $scattered,
+ OptionalParameter<"xegpu::SubGroupMapAttr"> : $map
+ );
+
+ let builders = [
+ AttrBuilder<(ins
+ CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::GLOBAL">:$memory_scope,
+ CArg<"int", "1">:$array_length,
+ CArg<"xegpu::ScatteredAttr", "{}">:$scattered,
+ CArg<"xegpu::SubGroupMapAttr", "{}">:$map
+ )>
+ ];
+
+ let extraClassDeclaration = [{
+ bool hasNonDefaultAttrs();
+ }];
+
+ let hasCustomAssemblyFormat = true;
+}
+
+def ARG_TYPE_VECTOR : I32EnumAttrCase<"VECTOR", 0, "vector">;
+def ARG_TYPE_SCALAR : I32EnumAttrCase<"SCALAR", 1, "scalar">;
+def XeGPU_ArgTypeKind : I32EnumAttr<"ArgTypeKind",
+ "Argument type for Invoke_SIMD op",
+ [ARG_TYPE_VECTOR, ARG_TYPE_SCALAR]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def MODE_SIMT : I32EnumAttrCase<"SIMT", 0, "simt">;
+def MODE_VC : I32EnumAttrCase<"VC", 1, "vc">;
+def XeGPU_ModeKind : I32EnumAttr<"ModeKind",
+ "The Mode an operator runs on",
+ [MODE_SIMT, MODE_VC]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def MEMORY_SCOPE_GLOBAL: I32EnumAttrCase<"GLOBAL", 0, "global">;
+def MEMORY_SCOPE_SHARED: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind",
+ "The scope of the memory the tensor descritor is created for",
+ [MEMORY_SCOPE_GLOBAL, MEMORY_SCOPE_SHARED]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def CACHE_KIND_CACHED: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def CACHE_KIND_UNCACHED: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def CACHE_KIND_STREAMING: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def CACHE_KIND_INVALIDATE: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def CACHE_KIND_WRITE_BACK: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def CACHE_KIND_WRITE_THROUGH: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+
+
+def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind",
+ [CACHE_KIND_CACHED, CACHE_KIND_UNCACHED,
+ CACHE_KIND_STREAMING, CACHE_KIND_INVALIDATE,
+ CACHE_KIND_WRITE_BACK, CACHE_KIND_WRITE_THROUGH]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ArgTypeAttr : EnumAttr<XeGPU_Dialect, XeGPU_ArgTypeKind, "arg_type_kind">;
+def XeGPU_ModeAttr : EnumAttr<XeGPU_Dialect, XeGPU_ModeKind, "mode_kind">;
+def XeGPU_MemoryScopeAttr : EnumAttr<XeGPU_Dialect, XeGPU_MemoryScopeKind, "memory_scope_kind">;
+def XeGPU_CacheAttr : EnumAttr<XeGPU_Dialect, XeGPU_CacheKind, "cache_kind">;
+
+// RMW kind attribute
+def ATOMIC_RMW_KIND_ADDF : I32EnumAttrCase<"addf", 0>;
+def ATOMIC_RMW_KIND_ADDI : I32EnumAttrCase<"addi", 1>;
+def ATOMIC_RMW_KIND_ASSIGN : I32EnumAttrCase<"assign", 2>;
+def ATOMIC_RMW_KIND_MAXF : I32EnumAttrCase<"maxf", 3>;
+def ATOMIC_RMW_KIND_MAXS : I32EnumAttrCase<"maxs", 4>;
+def ATOMIC_RMW_KIND_MAXU : I32EnumAttrCase<"maxu", 5>;
+def ATOMIC_RMW_KIND_MINF : I32EnumAttrCase<"minf", 6>;
+def ATOMIC_RMW_KIND_MINS : I32EnumAttrCase<"mins", 7>;
+def ATOMIC_RMW_KIND_MINU : I32EnumAttrCase<"minu", 8>;
+def ATOMIC_RMW_KIND_MULF : I32EnumAttrCase<"mulf", 9>;
+def ATOMIC_RMW_KIND_MULI : I32EnumAttrCase<"muli", 10>;
+def ATOMIC_RMW_KIND_ORI : I32EnumAttrCase<"ori", 11>;
+def ATOMIC_RMW_KIND_ANDI : I32EnumAttrCase<"andi", 12>;
+
+def XeGPU_AtomicRMWKind : I32EnumAttr<"AtomicRMWKind",
+ "Operation type for AtomicRMW",
+ [ATOMIC_RMW_KIND_ADDF, ATOMIC_RMW_KIND_ADDI, ATOMIC_RMW_KIND_ASSIGN,
+ ATOMIC_RMW_KIND_MAXF, ATOMIC_RMW_KIND_MAXS, ATOMIC_RMW_KIND_MAXU,
+ ATOMIC_RMW_KIND_MINF, ATOMIC_RMW_KIND_MINS, ATOMIC_RMW_KIND_MINU,
+ ATOMIC_RMW_KIND_MULF, ATOMIC_RMW_KIND_MULI, ATOMIC_RMW_KIND_ORI,
+ ATOMIC_RMW_KIND_ANDI]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+def XeGPU_AtomicRMWKindAttr : EnumAttr<XeGPU_Dialect, XeGPU_AtomicRMWKind, "atomic_rmw_kind">;
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
new file mode 100644
index 00000000000000..f85ccb32cc43b0
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -0,0 +1,46 @@
+//===- XeGPUDialect.td - XeGPU dialect definition -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
+
+def XeGPU_Dialect : Dialect {
+ let name = "xegpu";
+ let cppNamespace = "::mlir::xegpu";
+ let summary = "The XeGPU dialect that models Intel GPU's ISA";
+ let description = [{
+ The XeGPU dialect models Intel Xe ISA semantics but works at vector and
+ TensorDesc data type. It provides 1:1 mappings to match Xe instructions
+ like DPAS and 2D block load. The matrix size being processed at this level
+ exactly matches the hardware instructions or the intrinsic supported by
+ the lower-level GPU compiler.
+ }];
+
+ let dependentDialects = [
+ "arith::ArithDialect",
+ "memref::MemRefDialect"
+ ];
+
+ let useDefaultTypePrinterParser = true;
+ let useDefaultAttributePrinterParser = true;
+}
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
new file mode 100644
index 00000000000000..766590f6a3f878
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -0,0 +1,505 @@
+//===- XeGPUOps.td - XeGPU dialect operations definition ----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+
+include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
+include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+
+
+// Base class for dialect operations. This operation inherits from the base
+// `Op` class in OpBase.td, and provides:
+// * The parent dialect of the operation.
+// * The mnemonic for the operation, or the name without the dialect prefix.
+// * A list of traits for the operation.
+class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
+ Op<XeGPU_Dialect, mnemonic, traits>;
+
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+ let summary = "create nd tensor descriptor operation";
+ let description = [{
+ The "create_nd_tdesc" operation creates a TensorDescType which represents
+ a sub-view of a 2D memory region (It can be extended to support N-D memory
+ region if needed in future). Elements in the subview continuous in each
+ dimention. It encodes the following important information for supporting
+ Intel hardware features:
+
+ * source: an object representing (starting address/pointer of) a 2D memory reagion.
+ It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+ * offsets: two index values represents offsets from the "source" at the each dimension
+ at which the subview of the target memory will be created. It is encoded via two
+ variables, including "dynamic_offsets" and "static_offsets", such that it can
+ accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
+ * shape: the shape information of the memory region pointed by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ But if "source" is simply a pointer represented as uint64_t type, or a memref
+ type without shape information e.g., memref<?x?xf16>, the shape information has
+ to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+ only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+ * strides: the strides of the memory region pointed by the "source". Similar to shape,
+ it is typically encoded via the MemRefType of the source too. But if "source" is
+ simply a pointer represented as uint64_t type, or a memref type without shape
+ information e.g., memref<?x?xf16>, the strides information has to be explicitly
+ passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+
+ Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc() : memref<32x24xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1]: memref<32x24xf32> -> TensorDesc<8x16xf32>
+
+ Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+ Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = ... : ui64
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c1], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $dynamic_offsets,
+ Variadic<Index>: $dynamic_shape,
+ Variadic<Index>: $dynamic_strides,
+ DenseI64ArrayAttr: $static_offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let hasCustomAssemblyFormat = 1;
+ let skipDefaultBuilders = 1;
+ let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets,
+ "ValueRange": $shape, "ValueRange": $strides,
+ "llvm::ArrayRef<int64_t>": $static_offsets,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>,
+
+ OpBuilder<(ins "Type": $tdesc, "Value": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>,
+
+ OpBuilder<(ins "Type": $tdesc, "Value": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ "ValueRange": $shape, "ValueRange": $stride,
+ CArg<"xegpu::ModeKind", "xegpu::ModeKind::SIMT">: $mode)>
+ ];
+
+ let extraClassDeclaration = [{
+ /// Returns the type of the source memref operand.
+ Type getSourceType() {
+ return getSource().getType();
+ }
+
+ /// Returns the type of the result TensorDesc.
+ xegpu::TensorDescType getTensorDescType();
+
+ /// Returns the offsets info to the source. It consolidates
+ /// information from both dynamic_offsets and static_offsets
+ /// parameters. static_offsets parameter always has the expected
+ /// ranks with some dim could have ShapeType::kDynamic value
+ /// indicating the corresponding value should be from dynamic_offsets.
+ llvm::SmallVector<OpFoldResult> getOffsets();
+
+ /// returns the shape info of the source. It is either from the
+ /// memref type, if source is a memref with static shape
+ /// information or from the dynamic_shape parameter. If both
+ /// exists, the dynamic_shape parameter will be used and the
+ /// shape information from memref type will be ignored.
+ llvm::SmallVector<OpFoldResult> getShape();
+
+ /// returns the strides info of the source. It is either from the
+ /// memref type, if source is a memref with static shape
+ /// information or from the dynamic_stride parameter. If both
+ /// exists, the dynamic_strides parameter will be used and the
+ /// strides information from memref type will be ignored.
+ llvm::SmallVector<OpFoldResult> getStrides();
+
+ /// return the shape embeded in the memref type of the source.
+ /// If source is not memref type. array of kDynamic will be returned.
+ llvm::ArrayRef<int64_t> getStaticShape();
+
+ /// return the strides embeded in the memref type of the source.
+ /// If source is not memref type. array of kDynamic will be returned.
+ llvm::ArrayRef<int64_t> getStaticStrides();
+
+ /// Return the element type of the TensorDesc
+ Type getElementType();
+
+ /// Return the shape of the TensorDesc
+ llvm::ArrayRef<int64_t> getTensorDescShape();
+ }];
+
+}
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+ let summary = "loads a n-D block from memory (represented by TensorDesc)"
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNDOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of cache hints
+ for each level of cache, L1, L2 and L3. If hardware does not have a
+ correspoding cache, Corresponding cache hint attribute will be masked.
+ If both transpose and vnni_axis present at the same time. It assume to
+ perform transpose first and then vnni transform.
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ // Format: xegpu.load_nd %1 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming}
+ // : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+ let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ // Format: xegpu.store_nd %3, %2 {l1_hint = write_back, l2_hint = uncached}
+ // : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchNDOp : XeGPU_Op<"prefetch_nd", []> {
+ let summary = "prefetches a nD block to cache";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ // Format: xegpu.prefetch_nd %tdesc {l1_hint = cached, l2_hint = uncached}:
+ // !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+}
+
+def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> {
+ let summary = "update the offsets for the given tensor descriptor";
+
+ let arguments = (ins
+ XeGPU_TensorDesc: $TensorDesc,
+ Variadic<Index>: $offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure]> {
+ let summary = "create scattered tensor descritors (TensorDesc).";
+ let description = [{
+ "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
+ a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+ is for creating continious subviews, "create_tdesc" is for creating non-continious
+ (scattered) subviews. It is designed only works with VectorCompute (VC) mode and
+ accepts the following parameters:
+
+ * source: a 1D memref or pointer (uint64_t) represents the memory object.
+ * offsets: It is a 1D vector containing offsets of each access point, the supportted
+ group size, e.g., vector<16xindex>. And each element in the vector corresponds
+ to a work item (SIMT lane) in the subgroup.
+ * chunk_size_per_lane: [optional attribute] indicates number of continious elements
+ accessed for each offset, default is 1.
+
+ Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+ %a = memref.alloc() : memref<1024xf32>
+ %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex>
+ %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<4xf32>
+
+ Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
+ It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+ %0 = memref.alloc() : memref<1024xf32>
+ %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex>
+ %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+ }];
+
+ let arguments = (ins XeGPU_BaseAddrType: $source,
+ XeGPU_OffsetType: $offsets,
+ DefaultValuedAttr<I64Attr, "1">: $chunk_size_per_lane,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+ let builders = [
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+ "Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane)>,
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+ "Value": $offsets, "IntegerAttr": $chunk_size_per_lane)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1}
+ // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_LoadGatherOp : XeGPU_Op<"load"> {
+ let summary = "load a scalar at source[offset].";
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_ValueType: $value);
+
+ let builders = [
+ OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc,
+ "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis,
+ CArg<"mlir::DenseI64ArrayAttr", "mlir::DenseI64ArrayAttr()">: $transpose,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+
+ OpBuilder<(ins "mlir::Type": $value, "mlir::Value": $TensorDesc,
+ "mlir::Value": $mask, "mlir::IntegerAttr": $vnni_axis,
+ CArg<"DenseI64ArrayAttr", "DenseI64ArrayAttr()">: $transpose,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached}
+ // : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> {
+ let summary = "store a scalar to source[offset].";
+
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode
+ );
+
+ let builders = [
+ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+ OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "Value": $mask,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::WRITE_BACK">: $l3_hint)>
+ ];
+ let skipDefaultBuilders = 1;
+
+ // Format: %3 = xegpu.load %1, %0 {l1_hint = cached, l2_hint = uncached}
+ // : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+ let summary = "prefetches a nD block to cache";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheAttr>: $l3_hint,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+
+ let builders = [
+ OpBuilder<(ins "Value": $TensorDesc,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l1_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l2_hint,
+ CArg<"xegpu::CacheKindAttr", "xegpu::CacheKindAttr()">: $l3_hint)>,
+ OpBuilder<(ins "Value": $TensorDesc,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l1_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l2_hint,
+ CArg<"xegpu::CacheKind", "xegpu::CacheKind::CACHED">: $l3_hint)>
+ ];
+
+ let skipDefaultBuilders = 1;
+ let hasVerifier = 1;
+
+ // Format: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}:
+ // !xegpu.tensor_desc<8x16xf16>
+ let hasCustomAssemblyFormat = 1;
+}
+
+def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", []> {
+ let summary = "update the offsets for the given tensor descriptor";
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_OffsetType: $offsets,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_TensorDesc: $result);
+
+ let builders = [
+ OpBuilder<(ins "Type": $result, "Value": $TensorDesc, "Value": $offsets)>
+ ];
+
+ let skipDefaultBuilders = 1;
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_DpasOp : XeGPU_Op<"dpas"> {
+ let summary = "performs dpas computation";
+ let arguments = (ins
+ XeGPU_DpasOpType : $lhs,
+ XeGPU_DpasOpType : $rhs,
+ Optional<XeGPU_Vector2DType>: $acc,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let hasCustomAssemblyFormat = 1;
+
+ let extraClassDeclaration = [{
+ VectorType getLhsType() {
+ return ::llvm::cast<VectorType>(getLhs().getType());
+ }
+
+ VectorType getRhsType() {
+ return ::llvm::cast<VectorType>(getRhs().getType());
+ }
+
+ VectorType getAccType() {
+ return ::llvm::cast<VectorType>(getAcc().getType());
+ }
+
+ VectorType getResultType() {
+ return getResult().getType();
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> {
+ let summary = "Invoke_SIMD operation";
+ let description = [{
+ The `xegpu.invoke_SIMD` operation works similar to a direct call to a function.
+ But it is special to Intel GPU.
+ }];
+
+ let arguments = (ins FlatSymbolRefAttr:$callee,
+ Variadic<AnyType>:$operands,
+ XeGPU_ArgTypeAttr: $argType);
+ let results = (outs Variadic<AnyType>);
+
+ let builders = [
+ OpBuilder<(ins "SymbolRefAttr":$callee, "TypeRange":$results,
+ "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>,
+ OpBuilder<(ins "StringAttr":$callee, "TypeRange":$results,
+ "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>,
+ OpBuilder<(ins "llvm::StringRef":$callee, "TypeRange":$results,
+ "xegpu::ArgTypeKindAttr":$argType, CArg<"ValueRange", "{}">:$operands)>
+ ];
+}
+
+def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> {
+ let summary = "perform ready-modify-write operation that is free from data races.";
+ let arguments = (ins
+ XeGPU_AtomicRMWKindAttr:$kind,
+ XeGPU_TensorDesc:$tensorDesc,
+ XeGPU_MaskType:$mask,
+ Optional<XeGPU_ValueType>:$value,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode
+ );
+
+ let results = (outs XeGPU_ValueType:$result);
+ let hasCustomAssemblyFormat = 1;
+
+ let builders = [
+ OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKindAttr": $kind,
+ "Value": $tensorDesc, "Value": $mask, "Value": $value)>,
+ OpBuilder<(ins "Type": $result, "xegpu::AtomicRMWKind": $kind,
+ "Value": $tensorDesc, "Value": $mask, "Value": $value)>
+ ];
+
+ let skipDefaultBuilders = 1;
+ let hasVerifier = 1;
+}
+
+def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> {
+ let summary = "allocate a specific number of named barriers.";
+ let arguments = (ins I64Attr: $nbarrierCount);
+ let assemblyFormat = "$nbarrierCount attr-dict";
+}
+
+
+def XeGPU_CreateNbarrierOp: XeGPU_Op<"create_nbarrier", []> {
+ let summary = "create a named barrier.";
+ let arguments = (ins I8: $nbarrier_id,
+ I8: $nbarrier_role,
+ I8Attr: $num_producers,
+ I8Attr: $num_consumers,
+ DefaultValuedAttr<XeGPU_ModeAttr, "xegpu::ModeKind::SIMT">: $mode);
+ let results = (outs XeGPU_Nbarrier: $result);
+ let hasCustomAssemblyFormat = 1;
+}
+
+def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> {
+ let summary = "arrive at a named barrier.";
+ let arguments = (ins XeGPU_Nbarrier: $payload);
+ let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload))}];
+}
+
+def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> {
+ let summary = "wait for a named barrier.";
+ let arguments = (ins XeGPU_Nbarrier: $payload);
+ let assemblyFormat = [{ $payload attr-dict `:` qualified(type($payload)) }];
+}
+
+def XeGPU_CompileHintOp: XeGPU_Op<"compile_hint", []> {
+ let summary = "prevents the compiler from scheduling.";
+ let assemblyFormat = [{ attr-dict }];
+}
+
+def XeGPU_MfenceOp: XeGPU_Op<"mfence", []> {
+ let summary = "lsc fence.";
+ let arguments = (ins StrAttr: $memory_kind,
+ StrAttr: $fence_op,
+ StrAttr: $fence_scope);
+ let assemblyFormat = [{ attr-dict }];
+}
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
new file mode 100644
index 00000000000000..b3dceff9587ada
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -0,0 +1,170 @@
+//===- XeGPUTypes.td - XeGPU dialect types definition -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
+#define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
+
+include "mlir/IR/BuiltinTypes.td"
+
+include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
+include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+
+// An Integer array attribute with fixed 2 elements.
+def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
+def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
+def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
+def XeGPU_BaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1, 2]>, UI64, UI32, I64, I32]>;
+def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
+// def XeGPU_OffsetType: AnyTypeOf<[VectorOfRankAndType<[1], [Index]>, Index]>;
+def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
+def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1,2], [I1]>, I1]>;
+def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
+
+def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+
+// common base class for types in XeGPU dialect
+class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
+ string baseCppClass = "::mlir::Type">
+ : TypeDef<XeGPU_Dialect, name, traits, baseCppClass> {
+ let mnemonic = typeMnemonic;
+}
+
+// TensorDesc contains dim and element type info
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ [ShapedTypeInterface], "::mlir::TensorType"> {
+ let summary = "TensorDesc describing all kinds of memory and tensors, including scatter tensor, 1d tensor, 2d tensor, … 5d tensor";
+ let description = [{
+ TensorDesc is a type designed to describe all kinds of memory, scatter tensor, 1d tensor, 2d tensor, … 5d tensor.
+ Different with the builtin tensor type in MLIR, it essentially only contains the meta data that describes a region
+ of the intereted data as well as some features that are unique to intel hardware features. It does not hold the data
+ directly by itself. It is designed to mainly support 2d block load/store and DPAS (matrix multiplication instruction)
+ on Intel GPU. It majorly encodes the following information:
+
+ * shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
+ and each row contains 16 continious data element. The rows could be
+ either continuous or not, depends on whether the encoding attribute
+ is set or not.
+ * element_type: the data type of the data element, e.g., f16, f32.
+
+ Similar to the builtin tensor, it also provides an optinal attribute to encoding the following information via the TensorDescAttr object:
+ * memory_scope (xegpu::MemoryScope): [optional] where the data is located, global memory or shared memory. It is default to Global.
+ * array_length (int): [optional] The number of continuous blocks with size as `shape`,
+ that will be loaded by block load at a time. It is default to 1.
+ * boundary_check (bool): [optional] indicates whether the operation detects the boundary and pads with zero for out-of-boundary access (default)
+ * scattered (xegpu::ScatteredAttr): [optional] It is a unit attribute. It can be only set as empty or ScatteredAttr, indicating
+ whether the TensorDesc is blocked (empty, default) or scattered (ScatteredAttr). If it is
+ blocked, rows are continuous in the correspoding dimention, otherwise, rows may be not continous.
+ * mapping (xegpu::SubGroupMapAttr): [optional] Used to guide compiler to distribute the workload into different threads. It is default to none.
+
+ For convinience, its attribute field can also take either "ScatteredAttr" or "SubGroupMapAttr" directly if and only
+ if others are taking default values.
+
+ Syntax:
+
+ ```
+ TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+ element-type ::= float-type | integer-type | index-type
+ dim-list := (static-dim-list `x`)?
+ static-dim-list ::= decimal-literal `x` decimal-literal
+ attr-list = (, memory_scope = value)? (, arr_len = value)? (, ScatteredAttr)? (, mapping)?
+ ```
+
+ Examples:
+
+ ```mlir
+ // A block TensorDesc with 3x42 i32 elements
+ xegpu.tensor_desc<3x42xi32>
+
+ // A block TensorDesc with 4x5 f32 elements
+ xegpu.tensor_desc<4x5xf32>
+
+ // A Scattered TensorDesc with 16x4 f32 elements
+ xegpu.tensor_desc<16x4xf32, #!xegpu.scattered>
+
+ // A TensorDesc with 8x16 f16 elements.
+ // It will be distributed accross 16 hardware threads, organized as [2, 8],
+ // and each access 2 continious elements in dim 1.
+ xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+
+ // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+ xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ ```
+ }];
+
+ let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+ "mlir::Type": $elementType,
+ OptionalParameter<"mlir::Attribute">: $encoding);
+
+ let builders = [
+ TypeBuilderWithInferredContext<(ins
+ "llvm::ArrayRef<int64_t>":$shape, "mlir::Type":$elementType,
+ CArg<"mlir::Attribute", "{}"> : $encoding
+ )>,
+ TypeBuilder<(ins
+ "llvm::ArrayRef<int64_t>": $shape, "mlir::Type": $elementType,
+ "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length,
+ "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered,
+ "mlir::xegpu::SubGroupMapAttr": $mapping
+ )>,
+ TypeBuilderWithInferredContext<(ins
+ "llvm::ArrayRef<int64_t>": $shape, "mlir::Type": $elementType,
+ "mlir::xegpu::MemoryScopeKind": $memory_scope, "int": $array_length,
+ "bool": $boundary_check, "mlir::xegpu::ScatteredAttr": $scattered,
+ "mlir::xegpu::SubGroupMapAttr": $mapping
+ )>
+ ];
+
+ let extraClassDeclaration = [{
+ using TensorType::clone;
+ using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+ using mlir::ShapedType::Trait<TensorDescType>::getRank;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+ using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+ using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+ using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+ using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+ TensorDescType clone(::mlir::Type elementType) {
+ return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+ }
+
+ TensorDescAttr getEncodingAsTensorDescAttr() const {
+ return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+ }
+
+ SubGroupMapAttr getEncodingAsMapAttr() const {
+ return llvm::dyn_cast_if_present<SubGroupMapAttr>(getEncoding());
+ }
+
+ ScatteredAttr getEncodingAsScatteredAttr() const {
+ return llvm::dyn_cast_if_present<ScatteredAttr>(getEncoding());
+ }
+
+ xegpu::MemoryScopeKind getMemoryScope();
+ int getArrayLength();
+ bool getBoundaryCheck();
+ xegpu::ScatteredAttr getScattered();
+ xegpu::SubGroupMapAttr getMapping();
+ }];
+
+ let hasCustomAssemblyFormat = true;
+}
+
+
+def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
+ let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier.";
+
+ let extraClassDeclaration = [{
+ static NbarrierType get(mlir::MLIRContext *context) {
+ return Base::get(context);
+ };
+ }];
+}
+
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index 19a62cadaa2e04..838b7b87b09b64 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -87,6 +87,7 @@
#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h"
#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Interfaces/CastInterfaces.h"
#include "mlir/Target/LLVM/NVVM/Target.h"
@@ -138,7 +139,8 @@ inline void registerAllDialects(DialectRegistry ®istry) {
transform::TransformDialect,
ub::UBDialect,
vector::VectorDialect,
- x86vector::X86VectorDialect>();
+ x86vector::X86VectorDialect,
+ xegpu::XeGPUDialect>();
// clang-format on
// Register all external models.
diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt
index 68776a695cac4d..f5eeaaed5af97d 100644
--- a/mlir/lib/Dialect/CMakeLists.txt
+++ b/mlir/lib/Dialect/CMakeLists.txt
@@ -39,6 +39,7 @@ add_subdirectory(UB)
add_subdirectory(Utils)
add_subdirectory(Vector)
add_subdirectory(X86Vector)
+add_subdirectory(XeGPU)
set(LLVM_OPTIONAL_SOURCES
Traits.cpp
diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
new file mode 100644
index 00000000000000..f33061b2d87cff
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
new file mode 100644
index 00000000000000..2e99f39ed86d2e
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_dialect_library(MLIRXeGPUDialect
+ XeGPUDialect.cpp
+ XeGPUOps.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${PROJECT_SOURCE_DIR}/include/mlir/Dialect/XeGPU
+
+ DEPENDS
+ MLIRXeGPUIncGen
+ MLIRXeGPUAttrsIncGen
+ MLIRXeGPUEnumsIncGen
+
+ LINK_LIBS PUBLIC
+ MLIRIR
+)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
new file mode 100644
index 00000000000000..60ab50227c2247
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -0,0 +1,385 @@
+//===- XeGPUDialect.cpp - MLIR XeGPU dialect implementation -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <llvm/ADT/TypeSwitch.h>
+#include <llvm/Support/Debug.h>
+#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Linalg/IR/Linalg.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Dialect/Tensor/IR/Tensor.h>
+#include <mlir/Dialect/Utils/StaticValueUtils.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/DialectImplementation.h>
+#include <mlir/IR/TypeUtilities.h>
+
+#include <numeric>
+
+namespace mlir {
+namespace xegpu {
+
+void XeGPUDialect::initialize() {
+ addTypes<
+#define GET_TYPEDEF_LIST
+#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
+ >();
+ addOperations<
+#define GET_OP_LIST
+#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
+ >();
+ addAttributes<
+#define GET_ATTRDEF_LIST
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
+ >();
+}
+
+bool printDefaultValues() {
+ auto *env = getenv("MLIR_XEGPU_PRINT_DEFAULTS");
+ if (env && std::string(env) == "true")
+ return true;
+ return false;
+}
+
+SubGroupMapAttr SubGroupMapAttr::get(mlir::MLIRContext *context,
+ llvm::ArrayRef<int32_t> wiLayout,
+ llvm::ArrayRef<int32_t> wiData) {
+ assert(wiLayout.size() == 2 && wiData.size() == 2 &&
+ "wiLayout and wiData should be 2D arrays.\n");
+ return Base::get(context, mlir::DenseI32ArrayAttr::get(context, wiLayout),
+ mlir::DenseI32ArrayAttr::get(context, wiData));
+}
+
+mlir::LogicalResult SubGroupMapAttr::verify(
+ llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+ mlir::DenseI32ArrayAttr layout, mlir::DenseI32ArrayAttr data) {
+
+ if (layout.size() != 2) {
+ emitError() << "Failed to parse SubGroupMapAttr: missing wi_layout which "
+ "is to be an integer array of size 2.\n";
+ return mlir::failure();
+ }
+
+ if (data.size() != 2) {
+ emitError() << "Failed to parse SubGroupMapAttr: missing wi_data which is "
+ "to be an integer array of size 2.\n";
+ return mlir::failure();
+ }
+
+ return mlir::success();
+}
+
+mlir::Attribute TensorDescAttr::parse(mlir::AsmParser &parser,
+ mlir::Type type) {
+ mlir::FailureOr<xegpu::MemoryScopeKind> memory_scope;
+ mlir::FailureOr<int> array_length;
+ mlir::FailureOr<bool> boundary_check;
+ mlir::FailureOr<xegpu::ScatteredAttr> scattered;
+ mlir::FailureOr<xegpu::SubGroupMapAttr> map;
+
+ bool seen_memory_scope = false;
+ bool seen_array_length = false;
+ bool seen_boundary_check = false;
+ bool seen_scattered = false;
+ bool seen_map = false;
+
+ // Parse literal '<'
+ if (parser.parseLess())
+ return {};
+
+ // Parse elements
+ auto parseElt = [&]() -> mlir::ParseResult {
+ llvm::StringRef paramKey;
+
+ if (!parser.parseOptionalKeyword(¶mKey)) {
+ if (parser.parseEqual())
+ return mlir::failure();
+
+ if (!seen_memory_scope && paramKey == "memory_scope") {
+ seen_memory_scope = true;
+ // Parse variable 'memory_scope'
+ memory_scope =
+ mlir::FieldParser<mlir::xegpu::MemoryScopeKind>::parse(parser);
+ if (mlir::failed(memory_scope))
+ return parser.emitError(
+ parser.getCurrentLocation(),
+ "Failed to parse the 'memory_scope' of TensorDescAttr, which is "
+ "to be a `xegpu::MemoryScope`");
+ } else if (!seen_array_length && paramKey == "array_length") {
+ seen_array_length = true;
+ // Parse variable 'array_length'
+ array_length = ::mlir::FieldParser<int>::parse(parser);
+ if (mlir::failed(array_length))
+ return parser.emitError(parser.getCurrentLocation(),
+ "Failed to parse the 'array_length' of "
+ "TensorDescAttr, which is to be a `int`");
+ } else if (!seen_boundary_check && paramKey == "boundary_check") {
+ seen_boundary_check = true;
+ // Parse variable 'boundary_check'
+ boundary_check = ::mlir::FieldParser<bool>::parse(parser);
+ if (::mlir::failed(boundary_check))
+ return parser.emitError(parser.getCurrentLocation(),
+ "Failed to parse the 'boundary_check' of "
+ "TensorDescAttr, which is to be a `bool`");
+ } else if (!seen_map && paramKey == "map") {
+ seen_map = true;
+ // Parse variable 'map'
+ map = ::mlir::FieldParser<xegpu::SubGroupMapAttr>::parse(parser);
+ if (::mlir::failed(map))
+ return parser.emitError(
+ parser.getCurrentLocation(),
+ "Failed to parse the 'map' of TensorDescAttr, which is to be a "
+ "`xegpu::SubGroupMapAttr`");
+ }
+ } else if (!seen_scattered) {
+ // parse scattered
+ scattered = mlir::FieldParser<xegpu::ScatteredAttr>::parse(parser);
+ if (mlir::failed(scattered))
+ return parser.emitError(
+ parser.getCurrentLocation(),
+ "Failed to parse 'scattered' attr of TensorDescAttr, which is to "
+ "be a `xegpu::ScatteredAttr`");
+ seen_scattered = true;
+ }
+ return mlir::success();
+ };
+
+ if (parser.parseCommaSeparatedList(parseElt))
+ return {};
+
+ // Parse literal '>'
+ if (parser.parseGreater())
+ return {};
+ return TensorDescAttr::get(
+ parser.getContext(),
+ memory_scope.value_or(xegpu::MemoryScopeKind::GLOBAL),
+ array_length.value_or(1), boundary_check.value_or(true),
+ scattered.value_or(xegpu::ScatteredAttr()),
+ map.value_or(xegpu::SubGroupMapAttr()));
+}
+
+void TensorDescAttr::print(::mlir::AsmPrinter &printer) const {
+ bool printSep = false;
+ bool printDefaults = printDefaultValues();
+
+ printer << "<";
+
+ if (printDefaults || getMemoryScope() != xegpu::MemoryScopeKind::GLOBAL) {
+ if (printSep)
+ printer << ", ";
+ printSep = true;
+ printer << "memory_scope = ";
+ printer.printStrippedAttrOrType(getMemoryScope());
+ }
+ if (printDefaults || getArrayLength() != 1) {
+ if (printSep)
+ printer << ", ";
+ printSep = true;
+ printer << "array_length = ";
+ printer.printStrippedAttrOrType(getArrayLength());
+ }
+ if (printDefaults || getBoundaryCheck() != true) {
+ if (printSep)
+ printer << ", ";
+ printSep = true;
+ printer << "boundary_check = ";
+ printer.printStrippedAttrOrType(getBoundaryCheck());
+ }
+ if (getScattered()) {
+ if (printSep)
+ printer << ", ";
+ printSep = true;
+ printer.printStrippedAttrOrType(getScattered());
+ }
+ if (getMap()) {
+ if (printSep)
+ printer << ", ";
+ printSep = true;
+ printer << "map = ";
+ printer.printStrippedAttrOrType(getMap());
+ }
+ printer << ">";
+}
+
+bool TensorDescAttr::hasNonDefaultAttrs() {
+ int count = 0;
+ if (getMemoryScope() != MemoryScopeKind::GLOBAL)
+ count++;
+ if (getBoundaryCheck() != true)
+ count++;
+ if (getArrayLength() != 1)
+ count++;
+ if (getScattered())
+ count++;
+ if (getMap())
+ count++;
+ return count;
+}
+
+TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context,
+ xegpu::MemoryScopeKind memory_scope,
+ int array_length,
+ xegpu::ScatteredAttr scattered,
+ xegpu::SubGroupMapAttr map) {
+ return Base::get(context, std::move(memory_scope), std::move(array_length),
+ true, std::move(scattered), std::move(map));
+}
+
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+ llvm::SmallVector<int64_t> shape;
+ mlir::Type elementType;
+ mlir::FailureOr<mlir::Attribute> encoding;
+
+ // Parse literal '<'
+ if (parser.parseLess())
+ return {};
+
+ auto shapeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseDimensionList(shape))) {
+ parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+ return {};
+ }
+
+ auto elemTypeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseType(elementType))) {
+ parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+ return {};
+ }
+
+ // parse optional attributes
+ if (mlir::succeeded(parser.parseOptionalComma())) {
+ encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+ if (mlir::failed(encoding)) {
+ parser.emitError(
+ parser.getCurrentLocation(),
+ "Failed to parse the attribute field for TensorDescType.\n");
+ return {};
+ }
+ }
+
+ // Parse literal '>'
+ if (parser.parseGreater())
+ return {};
+
+ return TensorDescType::get(parser.getContext(), shape, elementType,
+ encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+ printer << "<";
+
+ auto shape = getShape();
+ for (int64_t dim : shape) {
+ if (mlir::ShapedType::isDynamic(dim))
+ printer << '?';
+ else
+ printer << dim;
+ printer << 'x';
+ }
+ printer << getElementType();
+
+ if (printDefaultValues()) {
+ auto encoding = getEncoding();
+ if (auto attr = getEncodingAsMapAttr()) {
+ encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1,
+ {}, attr);
+ }
+ if (auto attr = getEncodingAsScatteredAttr()) {
+ encoding = TensorDescAttr::get(getContext(), MemoryScopeKind::GLOBAL, 1,
+ attr, {});
+ }
+ printer << ", " << encoding;
+ } else if (auto encoding = getEncodingAsTensorDescAttr()) {
+ if (encoding.hasNonDefaultAttrs())
+ printer << ", " << encoding;
+ } else if (auto encoding = getEncoding()) {
+ printer << ", " << encoding;
+ }
+ printer << ">";
+}
+
+TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
+ mlir::Type elementType,
+ mlir::Attribute encoding) {
+ return Base::get(elementType.getContext(), shape, elementType, encoding);
+}
+
+TensorDescType TensorDescType::get(mlir::MLIRContext *context,
+ llvm::ArrayRef<int64_t> shape,
+ mlir::Type elementType,
+ mlir::xegpu::MemoryScopeKind memory_scope,
+ int array_length, bool boundary_check,
+ mlir::xegpu::ScatteredAttr scattered,
+ mlir::xegpu::SubGroupMapAttr mapping) {
+ auto attr = TensorDescAttr::get(context, memory_scope, array_length,
+ boundary_check, scattered, mapping);
+ return Base::get(context, shape, elementType, attr);
+}
+
+TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
+ mlir::Type elementType,
+ mlir::xegpu::MemoryScopeKind memory_scope,
+ int array_length, bool boundary_check,
+ mlir::xegpu::ScatteredAttr scattered,
+ mlir::xegpu::SubGroupMapAttr mapping) {
+ auto attr =
+ TensorDescAttr::get(elementType.getContext(), memory_scope, array_length,
+ boundary_check, scattered, mapping);
+ return Base::get(elementType.getContext(), shape, elementType, attr);
+}
+
+xegpu::MemoryScopeKind TensorDescType::getMemoryScope() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr)
+ return attr.getMemoryScope();
+ // return default value
+ return MemoryScopeKind::GLOBAL;
+}
+
+int TensorDescType::getArrayLength() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr)
+ return attr.getArrayLength();
+ // return default value
+ return 1;
+}
+
+bool TensorDescType::getBoundaryCheck() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr)
+ return attr.getBoundaryCheck();
+ // return default value
+ return true;
+}
+
+xegpu::ScatteredAttr TensorDescType::getScattered() {
+ if (auto attr = getEncodingAsTensorDescAttr())
+ return attr.getScattered();
+ if (auto attr = getEncodingAsScatteredAttr())
+ return attr;
+ // return default value
+ return {};
+}
+
+xegpu::SubGroupMapAttr TensorDescType::getMapping() {
+ if (auto attr = getEncodingAsTensorDescAttr())
+ return attr.getMap();
+ if (auto attr = getEncodingAsMapAttr())
+ return attr;
+ // return default value
+ return xegpu::SubGroupMapAttr();
+}
+
+} // namespace xegpu
+} // namespace mlir
+
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc>
+#define GET_ATTRDEF_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
+#define GET_TYPEDEF_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc>
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
new file mode 100644
index 00000000000000..627680e84ec949
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -0,0 +1,1929 @@
+//===- XeGPUOps.cpp - MLIR XeGPU ops implementation -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <llvm/ADT/TypeSwitch.h>
+#include <llvm/Support/Debug.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Linalg/IR/Linalg.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Dialect/Tensor/IR/Tensor.h>
+#include <mlir/Dialect/Utils/StaticValueUtils.h>
+#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/DialectImplementation.h>
+#include <mlir/IR/TypeUtilities.h>
+#include <numeric>
+#include <type_traits>
+
+#define DEBUG_TYPE "xegpu"
+
+namespace mlir {
+class Token;
+
+namespace xegpu {
+
+extern bool printDefaultValues();
+
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+ std::string buf;
+ buf.clear();
+ llvm::raw_string_ostream os(buf);
+ os << "[";
+ for (size_t i = 1; i < array.size(); i++) {
+ os << array[i - 1] << ", ";
+ if (breakline)
+ os << "\n\t\t";
+ }
+ os << array.back() << "]";
+ os.flush();
+ return buf;
+}
+
+static size_t getRankOf(Value value) {
+ if (value.getType().isIntOrIndexOrFloat())
+ return 0;
+ if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
+ return ty.getRank();
+ if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
+ return ty.getRank();
+ llvm_unreachable("Unsupported value for getRankOf");
+}
+
+static void transpose(llvm::ArrayRef<int64_t> trans,
+ std::vector<int64_t> &shape) {
+ std::vector<int64_t> old = shape;
+ for (size_t i = 0; i < trans.size(); i++)
+ shape[i] = old[trans[i]];
+}
+
+static bool verifyAndInferShape(std::vector<int64_t> &shape,
+ SubGroupMapAttr sgMap) {
+ if (sgMap) {
+ auto wiLayout = sgMap.getWiLayout();
+ auto wiData = sgMap.getWiData();
+
+ if ((int64_t)shape.size() != wiData.size() ||
+ (int64_t)shape.size() != wiLayout.size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < shape.size(); i++) {
+
+ if ((shape[i] % (wiLayout[i] * wiData[i]) != 0 &&
+ (wiLayout[i] * wiData[i]) % shape[i] != 0) ||
+ shape[i] % wiLayout[i] != 0 || shape[i] % wiData[i] != 0) {
+ return false;
+ }
+ shape[i] /= wiLayout[i];
+ }
+ }
+
+ return true;
+}
+
+static ParseResult
+parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser,
+ OperationState &result) {
+ // no optional attributes, return success
+ if (failed(parser.parseOptionalLBrace()))
+ return success();
+
+ llvm::SmallDenseSet<StringRef, 8> seenKeys;
+ auto parseElt = [&]() -> ParseResult {
+ // The name of an attribute can either be a keyword, or a string.
+ // as compared to mlir::parseOptionalAttrList, the cases of using
+ // TOken::bare_identifier and Token::inttype as key maybe not handlered
+ std::string nameId;
+ auto loc = parser.getCurrentLocation();
+ if (parser.parseOptionalKeywordOrString(&nameId))
+ return parser.emitError(loc, "invalid attribute name: ")
+ << nameId << ".\n";
+
+ if (nameId.empty())
+ return parser.emitError(loc, "expected valid attribute name");
+
+ if (!seenKeys.insert(nameId).second)
+ return parser.emitError(loc, "duplicate key '")
+ << nameId << "' in dictionary attribute.";
+
+ // Lazy load a dialect in the context if there is a possible namespace.
+ auto splitName = StringRef(nameId).split('.');
+ if (!splitName.second.empty())
+ parser.getContext()->getOrLoadDialect(splitName.first);
+
+ // Try to parse the '=' for the attribute value.
+ if (parser.parseEqual()) {
+ // If there is no '=', it is treated as a unit attribute.
+ result.addAttribute(nameId, parser.getBuilder().getUnitAttr());
+ return success();
+ }
+
+ // for xegpu specific attributes
+ if (nameId == "mode") {
+ ModeKindAttr attr;
+ return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
+ result.attributes);
+ } else if (nameId == "l1_hint" || nameId == "l2_hint" ||
+ nameId == "l3_hint") {
+ CacheKindAttr attr;
+ return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId,
+ result.attributes);
+ } else if (nameId == "transpose") {
+ // in form of [4, 5], acctually it is a copy of DenseI63ArrayAttr::parse()
+ if (succeeded(parser.parseOptionalLSquare())) {
+ Attribute attr;
+ // handle empty list case
+ if (succeeded(parser.parseOptionalRSquare())) {
+ attr = DenseI64ArrayAttr::get(parser.getContext(), {});
+ } else {
+ attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{});
+ if (failed(parser.parseRSquare()))
+ return failure();
+ }
+ if (!attr)
+ return failure();
+ result.addAttribute(nameId, attr);
+ return success();
+ } else {
+ // in form of array<i64: 4, 5>
+ DenseI64ArrayAttr attr;
+ return parser.parseAttribute(attr, nameId, result.attributes);
+ }
+ } else {
+ Attribute attr;
+ return parser.parseAttribute(attr, nameId, result.attributes);
+ }
+ };
+
+ if (parser.parseCommaSeparatedList(parseElt))
+ return failure();
+
+ return parser.parseRBrace();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type TensorDesc, Value source, ValueRange offsets,
+ ValueRange shape, ValueRange strides,
+ llvm::ArrayRef<int64_t> static_offsets,
+ ModeKind mode) {
+ auto offsetRank = static_offsets.size();
+ auto shapeRank = shape.size() ? shape.size() : getRankOf(source);
+
+ size_t dynOffsetRank =
+ std::count_if(static_offsets.begin(), static_offsets.end(),
+ [](int64_t d) { return ShapedType::isDynamic(d); });
+
+ // shape and strides should exists at the same time
+ // and the final rank for shape and offset (dynamic + static)
+ // should be the same
+ assert(shape.size() == strides.size() && shapeRank == offsetRank &&
+ offsets.size() == dynOffsetRank);
+
+ state.addOperands(source);
+ state.addOperands(offsets);
+ state.addOperands(shape);
+ state.addOperands(strides);
+ state.addAttribute(
+ getOperandSegmentSizesAttrName(state.name),
+ builder.getDenseI32ArrayAttr({1, static_cast<int32_t>(offsets.size()),
+ static_cast<int32_t>(shape.size()),
+ static_cast<int32_t>(strides.size())}));
+ state.addAttribute(getStaticOffsetsAttrName(state.name),
+ builder.getDenseI64ArrayAttr(static_offsets));
+ state.addAttribute(getModeAttrName(state.name),
+ xegpu::ModeKindAttr::get(builder.getContext(), mode));
+ state.addTypes(TensorDesc);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type tdesc, Value source,
+ llvm::ArrayRef<OpFoldResult> offsets,
+ ModeKind mode) {
+ auto ty = llvm::dyn_cast_if_present<MemRefType>(source.getType());
+ assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source));
+
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<Value> dynamicOffsets;
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+ build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+ ValueRange({}) /* empty dynamic shape */,
+ ValueRange({}) /* empty dynamic strides */,
+ staticOffsets /* static offsets */, mode);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type tdesc, Value source,
+ llvm::ArrayRef<OpFoldResult> offsets,
+ ValueRange shape, ValueRange stride, ModeKind mode) {
+ assert(shape.size() && offsets.size() && stride.size() &&
+ shape.size() == stride.size() && shape.size() == offsets.size());
+
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<Value> dynamicOffsets;
+
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+ build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+ shape /* dynamic shape */, stride /* dynamic strides */,
+ staticOffsets /* static offsets */, mode);
+}
+
+ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) {
+ // parse the source operand
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> sourceOperands(1);
+ llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(sourceOperands[0]))
+ return failure();
+
+ // parse the offset operand, in format of [x, y]
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> offsetsOperands;
+ DenseI64ArrayAttr static_offsetsAttr;
+ llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation();
+ if (parseDynamicIndexList(parser, offsetsOperands, static_offsetsAttr))
+ return failure();
+ result.addAttribute("static_offsets", static_offsetsAttr);
+
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> shapeOperands;
+ llvm::SMLoc shapeOperandsLoc;
+
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> stridesOperands;
+ llvm::SMLoc stridesOperandsLoc;
+ // parse optional shape and strides, shape and strides should always come
+ // together
+ if (succeeded(parser.parseOptionalComma())) {
+ // parse shape part, in form of [x, y]
+ if (parser.parseLSquare())
+ return failure();
+ shapeOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(shapeOperands))
+ return failure();
+ if (parser.parseRSquare())
+ return failure();
+
+ if (parser.parseComma())
+ return failure();
+
+ // parse stride part, in form of [x, y]
+ if (parser.parseLSquare())
+ return failure();
+ stridesOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(stridesOperands))
+ return failure();
+ if (parser.parseRSquare())
+ return failure();
+ }
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ llvm::SmallVector<Type> sourceTypes(1);
+ if (parser.parseType(sourceTypes[0]))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> TensorDescTypes(1);
+ if (parser.parseType(TensorDescTypes[0]))
+ return failure();
+ result.addAttribute("operandSegmentSizes",
+ parser.getBuilder().getDenseI32ArrayAttr(
+ {1, static_cast<int32_t>(offsetsOperands.size()),
+ static_cast<int32_t>(shapeOperands.size()),
+ static_cast<int32_t>(stridesOperands.size())}));
+
+ result.addTypes(TensorDescTypes);
+ if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc,
+ result.operands))
+ return failure();
+
+ Type indexType = parser.getBuilder().getIndexType();
+ if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc,
+ result.operands))
+ return failure();
+ if (parser.resolveOperands(shapeOperands, indexType, shapeOperandsLoc,
+ result.operands))
+ return failure();
+ if (parser.resolveOperands(stridesOperands, indexType, stridesOperandsLoc,
+ result.operands))
+ return failure();
+ return success();
+}
+
+void CreateNdDescOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getSource();
+ printDynamicIndexList(printer, *this, getDynamicOffsets(),
+ getStaticOffsetsAttr());
+ if (!getDynamicShape().empty()) {
+ printer << ",";
+ printer << ' ' << "[";
+ printer << getDynamicShape();
+ printer << "]";
+ }
+
+ if (!getDynamicStrides().empty()) {
+ printer << ",";
+ printer << ' ' << "[";
+ printer << getDynamicStrides();
+ printer << "]";
+ }
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ elidedAttrs.push_back("static_offsets");
+ elidedAttrs.push_back("operandSegmentSizes");
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getSourceType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getTensorDescType();
+}
+
+LogicalResult CreateNdDescOp::verify() {
+ auto mode = getMode();
+ auto isScattered = getTensorDescType().getScattered();
+ auto mapping = getTensorDescType().getMapping();
+
+ if (isScattered) {
+ return emitOpError("Encoding Attribute of TensorDesc is not expected for "
+ "non-scattered operators.\n");
+ }
+
+ if (mode == ModeKind::VC && mapping) {
+ return emitOpError("Mapping attribute of TensorDesc is not expected "
+ "for VC mode operations.\n");
+ }
+
+ if (mode == ModeKind::SIMT && !mapping) {
+ return emitOpError("Expecting SgMap attribute for SIMT mode operators.\n");
+ }
+
+ auto offsetRank = getOffsets().size();
+ auto shapeRank = getShape().size();
+ auto stridesRank = getStrides().size();
+ auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2;
+
+ if (offsetRank != shapeRank || shapeRank != stridesRank ||
+ shapeRank != baseRank)
+ return emitOpError(
+ "Expecting the rank of shape, strides, offsets and memref type "
+ "should match with each other (they currently should be 2D).");
+
+ return success();
+}
+
+xegpu::TensorDescType CreateNdDescOp::getTensorDescType() {
+ return getTensorDesc().getType();
+}
+
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
+ llvm::SmallVector<OpFoldResult> offsets;
+ auto dynamicOffsets = getDynamicOffsets(); // given by dynamic_offsets
+ // variable
+ auto staticOffsets = getStaticOffsets(); // given by static_offsets attribute
+
+ // in case static_offsets is missing
+ if (staticOffsets.size() == 0) {
+ offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
+ return offsets;
+ }
+
+ for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
+ if (ShapedType::isDynamic(staticOffsets[i])) {
+ assert(j < dynamicOffsets.size());
+ offsets.push_back(dynamicOffsets[j++]);
+ } else {
+ auto ty = IndexType::get(getContext());
+ auto attr = IntegerAttr::get(ty, staticOffsets[i]);
+ offsets.push_back(attr);
+ }
+ }
+ return offsets;
+}
+
+llvm::ArrayRef<int64_t> CreateNdDescOp::getStaticShape() {
+ auto rank = getTensorDescType().getRank();
+ static llvm::SmallVector<int64_t> dyn(rank, ShapedType::kDynamic);
+ auto srcTy = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+ if (srcTy)
+ return srcTy.getShape();
+
+ return dyn;
+}
+
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
+ llvm::SmallVector<OpFoldResult> shape;
+ auto dynShape = getDynamicShape();
+ if (dynShape.size()) {
+ shape.append(dynShape.begin(), dynShape.end());
+ return shape;
+ }
+
+ auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+ if (ty && ty.hasStaticShape()) {
+ for (auto dim : ty.getShape()) {
+ auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+ shape.push_back(attr);
+ }
+ return shape;
+ }
+
+ llvm_unreachable("Unexpected error in CreateNdDescOp. "
+ "The shape information is missing.\n");
+}
+
+llvm::ArrayRef<int64_t> CreateNdDescOp::getStaticStrides() {
+ auto rank = getTensorDescType().getRank();
+ static llvm::SmallVector<int64_t> dyn(rank, ShapedType::kDynamic);
+ auto srcTy = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+ if (srcTy) {
+ auto [strides, offset] = getStridesAndOffset(srcTy);
+ return strides;
+ }
+ return dyn;
+}
+
+llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
+ llvm::SmallVector<OpFoldResult> strides;
+
+ auto dynStrides = getDynamicStrides();
+ if (dynStrides.size()) {
+ strides.append(dynStrides.begin(), dynStrides.end());
+ return strides;
+ }
+
+ auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
+ if (ty && ty.hasStaticShape()) {
+ auto [staticStrides, offset] = getStridesAndOffset(ty);
+ for (auto dim : staticStrides) {
+ auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
+ strides.push_back(attr);
+ }
+ return strides;
+ }
+ llvm_unreachable("Unexpected error in CreateNdDescOp. The strides "
+ "information is missing.\n");
+}
+
+/// Return the element type of the TensorDesc
+Type CreateNdDescOp::getElementType() {
+ return getTensorDescType().getElementType();
+}
+
+/// Return the shape of the TensorDesc
+llvm::ArrayRef<int64_t> CreateNdDescOp::getTensorDescShape() {
+ return getTensorDescType().getShape();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNDOp
+//===----------------------------------------------------------------------===//
+
+ParseResult LoadNDOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands(1);
+ llvm::SMLoc OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(Operands[0]))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ llvm::SmallVector<Type> Types(1);
+ if (parser.parseType(Types[0]))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> valueTypes(1);
+ if (parser.parseType(valueTypes[0]))
+ return failure();
+
+ result.addTypes(valueTypes);
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+
+ return success();
+}
+
+void LoadNDOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getValue().getType();
+}
+
+LogicalResult LoadNDOp::verify() {
+ auto tdescTy = getTensorDescType();
+ auto valueTy = getValueType();
+
+ if (tdescTy.getRank() != 2)
+ return emitOpError(
+ "The TensorDesc for LoadNDOp should be a 2D TensorDesc.");
+
+ if (!valueTy)
+ return emitOpError("Invalid result, it should be a VectorType.\n");
+
+ auto tdescElemTy = tdescTy.getElementType();
+ auto valueElemTy = valueTy.getElementType();
+
+ if (tdescElemTy != valueElemTy)
+ return emitOpError(
+ "Value should have the same element type as TensorDesc.");
+
+ auto mode = getMode();
+ auto tdescShape = tdescTy.getShape().vec();
+ auto valueShape = valueTy.getShape().vec();
+ auto array_len = tdescTy.getArrayLength();
+
+ if (mode == ModeKind::SIMT) {
+ auto sgMap = tdescTy.getMapping();
+ if (!sgMap) {
+ return emitOpError(
+ "Expecting SgMap attribute for SIMT mode operators.\n");
+ }
+
+ if (!verifyAndInferShape(tdescShape, sgMap)) {
+ return emitOpError("Failed to infer the shape.")
+ << "The new shape[i] should meet the following condistions "
+ "for SubGroupMapAttr: "
+ << "\n\ttdescShape[i] % mma_block_size[i] == 0 (if it has) && "
+ << "\n\ttdescShape[i] % wi_layout[i] == 0 && "
+ << "\n\ttdescShape[i] % wi_data[i] == 0 && "
+ << "\n\t(tdescShape[i] % (wi_layout[i] * wi_data[i]) == 0 || "
+ << "\n\t (wi_layout[i] * wi_data[i]) % tdescShape[i] == 0).\n";
+ }
+ }
+
+ if (getTranspose()) {
+ auto trans = getTranspose().value();
+ if (tdescShape.size() >= trans.size())
+ transpose(trans, tdescShape);
+ else
+ emitWarning("Invalid transpose attr. It is ignored.");
+ }
+
+ if (getVnniAxis()) {
+ auto axis = getVnniAxis().value();
+ auto vnni_factor = valueShape.back();
+ tdescShape[axis] /= vnni_factor;
+ tdescShape.push_back(vnni_factor);
+ }
+
+ if (array_len > 1) {
+ auto it = tdescShape.begin();
+ tdescShape.insert(it, array_len);
+ }
+
+ if (tdescShape != valueShape)
+ return emitOpError("Result shape doesn't match TensorDesc shape.")
+ << "\nThe expected shape is " << makeString(tdescShape) << "."
+ << "\nBut the given shape is " << makeString(valueShape) << "."
+ << "\nIn VC mode, when VNNI is not enabled, the result should have "
+ << "the same shape (or transposed shape if transpose is enabled) "
+ << "as TensorDesc; \nwhen VNNI is enabled, the result should have "
+ << "one more dimention than the TensorDesc, with last dimention "
+ << "having vnni factor, \nbut having same number of total data "
+ << "elements. The vnni factor are typically calculated as "
+ << "simd_lane_width / elementTypeBitWidth. \nFor element type "
+ << "having more than 32 bits, vnni shouldn't be used. \nIn SIMT "
+ << "mode, the shape is derived from the mapping attributes.\n";
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNDOp
+//===----------------------------------------------------------------------===//
+ParseResult StoreNDOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands(2);
+ llvm::SMLoc OperandsLoc = parser.getCurrentLocation();
+ // parse value
+ if (parser.parseOperand(Operands[0]))
+ return failure();
+
+ if (parser.parseComma())
+ return failure();
+
+ // parse TensorDesc
+ if (parser.parseOperand(Operands[1]))
+ return failure();
+
+ // parse optional attributes
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ llvm::SmallVector<Type> Types;
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+
+ return success();
+}
+
+void StoreNDOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getValue();
+ printer << ",";
+ printer << ' ';
+ printer << getTensorDesc();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getValue().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+}
+
+LogicalResult StoreNDOp::verify() {
+ auto dstTy = getTensorDesc().getType(); // Tile
+ auto valTy = llvm::dyn_cast<VectorType>(getValue().getType()); // Vector
+
+ if (dstTy.getRank() != 2)
+ return emitOpError(
+ "The TensorDesc for StoreNdOp should be a 2D TensorDesc.");
+
+ if (!valTy)
+ return emitOpError("Invalid value operand, it should be a VectorType.\n");
+
+ auto dstElemTy = dstTy.getElementType();
+ auto valElemTy = valTy.getElementType();
+
+ if (dstElemTy != valElemTy) {
+ return emitOpError("The elem type of value (vector) shape doesn't match "
+ "the elem type of memory (dst) shape.\n");
+ }
+
+ auto mode = getMode();
+
+ if (mode == ModeKind::VC) { // for VC mode, no attr attached
+ if (dstTy.getShape() != valTy.getShape())
+ return emitOpError("In VC mode, the value (vector) shape doesn't match "
+ "the memory (dst) shape.\n");
+ } else {
+ auto mapping = dstTy.getMapping();
+ if (!mapping) {
+ return emitOpError(
+ "Expecting SgMap attribute for SIMT mode operators.\n");
+ }
+
+ SubGroupMapAttr sgMap;
+ std::vector<int64_t> shape = dstTy.getShape().vec();
+
+ sgMap = llvm::dyn_cast<SubGroupMapAttr>(mapping);
+
+ if (!verifyAndInferShape(shape, sgMap)) {
+ return emitOpError("Failed to infer the shape.")
+ << "The new shape[i] should meet the following condistions "
+ "for SubGroupMapAttr: "
+ << "\n\ttdescShape[i] % mma_block_size[i] == 0 (if it has) && "
+ << "\n\ttdescShape[i] % wi_layout[i] == 0 && "
+ << "\n\ttdescShape[i] % wi_data[i] == 0 && "
+ << "\n\t(tdescShape[i] % (wi_layout[i] * wi_data[i]) == 0 || "
+ << "\n\t (wi_layout[i] * wi_data[i]) % tdescShape[i] == 0).\n";
+ }
+
+ if (shape != valTy.getShape().vec())
+ return emitOpError(
+ "In SIMT mode, the value (vector) shape doesn't match the memory"
+ "(dst) shape as derived according to the mapping rule.\n");
+ }
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_PrefetchNDOp
+//===----------------------------------------------------------------------===//
+ParseResult PrefetchNDOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> TensorDescOperands(1);
+ llvm::SmallVector<Type> TensorDescTypes(1);
+ llvm::SMLoc TensorDescOperandsLoc;
+
+ TensorDescOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(TensorDescOperands[0]))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseType(TensorDescTypes[0]))
+ return failure();
+ if (parser.resolveOperands(TensorDescOperands, TensorDescTypes,
+ TensorDescOperandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void PrefetchNDOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateNDOffsetOp
+//===----------------------------------------------------------------------===//
+ParseResult UpdateNDOffsetOp::parse(OpAsmParser &parser,
+ OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> TensorDescOperands(1);
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand, 4> offsetsOperands;
+ llvm::SmallVector<Type> TensorDescTypes(1);
+ llvm::SmallVector<Type> resultTypes(1);
+ llvm::SMLoc TensorDescOperandsLoc;
+ llvm::SMLoc offsetsOperandsLoc;
+
+ TensorDescOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(TensorDescOperands[0]))
+ return failure();
+ if (parser.parseComma())
+ return failure();
+
+ // parse offsets, e.g., [x, y]
+ if (succeeded(parser.parseOptionalLSquare())) {
+ offsetsOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(offsetsOperands))
+ return failure();
+ if (parser.parseRSquare())
+ return failure();
+ }
+
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseType(TensorDescTypes[0]))
+ return failure();
+ if (parser.parseArrow())
+ return failure();
+
+ if (parser.parseType(resultTypes[0]))
+ return failure();
+ result.addTypes(resultTypes);
+ if (parser.resolveOperands(TensorDescOperands, TensorDescTypes,
+ TensorDescOperandsLoc, result.operands))
+ return failure();
+
+ Type indexType = parser.getBuilder().getIndexType();
+ if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc,
+ result.operands))
+ return failure();
+ return success();
+}
+
+void UpdateNDOffsetOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+ printer << ",";
+ if (!getOffsets().empty()) {
+ printer << ' ' << "[";
+ printer << getOffsets();
+ printer << "]";
+ }
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getResult().getType();
+}
+
+LogicalResult UpdateNDOffsetOp::verify() {
+ // number of offsets specified must match the rank of the tensor descriptor
+ if (getTensorDesc().getType().getRank() != (int64_t)getOffsets().size()) {
+ return emitOpError("Invalid number of offsets.");
+ }
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateDescOp
+//===----------------------------------------------------------------------===//
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+ TensorDescType TensorDesc, Value source, Value offsets,
+ uint32_t chunk_size_per_lane) {
+ state.addOperands(source);
+ state.addOperands(offsets);
+ state.getOrAddProperties<Properties>().chunk_size_per_lane =
+ builder.getIntegerAttr(builder.getIntegerType(32), chunk_size_per_lane);
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(TensorDesc);
+}
+
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+ TensorDescType TensorDesc, Value source, Value offsets,
+ IntegerAttr chunk_size_per_lane) {
+ state.addOperands(source);
+ state.addOperands(offsets);
+ if (chunk_size_per_lane)
+ state.getOrAddProperties<Properties>().chunk_size_per_lane =
+ chunk_size_per_lane;
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(TensorDesc);
+}
+
+ParseResult CreateDescOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands(2);
+ llvm::SmallVector<Type> Types(2);
+ llvm::SMLoc operandsLoc = parser.getCurrentLocation();
+ // parse the source operand
+ if (parser.parseOperand(Operands[0]))
+ return failure();
+
+ if (parser.parseComma())
+ return failure();
+
+ // parse the offset operand
+ if (parser.parseOperand(Operands[1]))
+ return failure();
+
+ // parse the optional attributes
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseType(Types[0]))
+ return failure();
+ if (parser.parseComma())
+ return failure();
+
+ if (parser.parseType(Types[1]))
+ return failure();
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> TensorDescTypes(1);
+ if (parser.parseType(TensorDescTypes[0]))
+ return failure();
+
+ result.addTypes(TensorDescTypes);
+ if (parser.resolveOperands(Operands, Types, operandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void CreateDescOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto chunk = getChunkSizePerLane();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getSource();
+ printer << ",";
+ printer << ' ';
+ printer << getOffsets();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults) {
+ if (mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ if (chunk == 1)
+ elidedAttrs.push_back("chunk_size_per_lane");
+ }
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getSource().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getOffsets().getType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+}
+
+LogicalResult CreateDescOp::verify() {
+ auto mode = getMode();
+ auto mapping = getTensorDesc().getType().getMapping();
+ auto offsetTy = getOffsets().getType();
+ auto tdescTy = getTensorDesc().getType();
+ auto chunkSize = getChunkSizePerLane();
+
+ if (mode == ModeKind::SIMT || mapping) {
+ return emitOpError("CreateDescOp only support VC mode and mapping "
+ "attribute of TensorDesc is not expected.\n");
+ }
+
+ if (getRankOf(getSource()) > 2)
+ return emitOpError(
+ "Expecting the source is a 1D/2D memref or pointer (uint64_t).");
+
+ if (!tdescTy.getScattered())
+ return emitOpError(
+ "Expecting the presence of ScatteredAttr for tensor descriptor.");
+
+ // Infer the TensorDesc shape
+ std::vector<int64_t> shape;
+ if (llvm::isa<VectorType>(offsetTy)) {
+ shape = llvm::dyn_cast<VectorType>(offsetTy).getShape().vec();
+ if (shape.size() != 1)
+ return emitOpError("Expecting the offset is a 1D vector.");
+ }
+
+ if (chunkSize != 1) {
+ shape.push_back(chunkSize);
+ }
+
+ auto tdescShape = tdescTy.getShape();
+ if (shape != tdescShape.vec()) {
+ return emitOpError("Expecting dimensions of offsets is the same as the "
+ "tensor descriptor, or one less than.");
+ }
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadGatherOp
+//===----------------------------------------------------------------------===//
+void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value,
+ Value TensorDesc, Value mask, IntegerAttr vnni_axis,
+ DenseI64ArrayAttr transpose, CacheKindAttr l1_hint,
+ CacheKindAttr l2_hint, CacheKindAttr l3_hint) {
+ state.addOperands(TensorDesc);
+ state.addOperands(mask);
+ if (vnni_axis)
+ state.getOrAddProperties<Properties>().vnni_axis = vnni_axis;
+
+ if (transpose)
+ state.getOrAddProperties<Properties>().transpose = transpose;
+
+ if (l1_hint)
+ state.getOrAddProperties<Properties>().l1_hint = l1_hint;
+
+ if (l2_hint)
+ state.getOrAddProperties<Properties>().l2_hint = l2_hint;
+
+ if (l3_hint)
+ state.getOrAddProperties<Properties>().l3_hint = l3_hint;
+
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(value);
+}
+
+void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type value,
+ Value TensorDesc, Value mask, IntegerAttr vnni_axis,
+ DenseI64ArrayAttr transpose, CacheKind l1_hint,
+ CacheKind l2_hint, CacheKind l3_hint) {
+ state.addOperands(TensorDesc);
+ state.addOperands(mask);
+ if (vnni_axis)
+ state.getOrAddProperties<Properties>().vnni_axis = vnni_axis;
+
+ if (transpose)
+ state.getOrAddProperties<Properties>().transpose = transpose;
+
+ state.getOrAddProperties<Properties>().l1_hint =
+ CacheKindAttr::get(builder.getContext(), l1_hint);
+ state.getOrAddProperties<Properties>().l2_hint =
+ CacheKindAttr::get(builder.getContext(), l2_hint);
+ state.getOrAddProperties<Properties>().l3_hint =
+ CacheKindAttr::get(builder.getContext(), l3_hint);
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(value);
+}
+
+ParseResult LoadGatherOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands(2);
+ llvm::SmallVector<Type> Types(2);
+ llvm::SmallVector<Type> valueTypes(1);
+ llvm::SMLoc OperandsLoc;
+
+ OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(Operands[0]))
+ return failure();
+
+ if (parser.parseComma())
+ return failure();
+
+ if (parser.parseOperand(Operands[1]))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseType(Types[0]))
+ return failure();
+
+ if (parser.parseComma())
+ return failure();
+
+ if (parser.parseType(Types[1]))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ if (parser.parseType(valueTypes[0]))
+ return failure();
+
+ result.addTypes(valueTypes);
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+
+ return success();
+}
+
+void LoadGatherOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+ printer << ",";
+ printer << ' ';
+ printer << getMask();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getMask().getType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getValue().getType();
+}
+
+LogicalResult LoadGatherOp::verify() {
+ auto tdescTy = getTensorDesc().getType();
+ auto maskTy = getMask().getType();
+ auto valueTy = getValue().getType();
+
+ if (!tdescTy.getScattered())
+ return emitOpError(
+ "LoadGatherOp only works on TensorDesc with ScatteredAttr.");
+
+ auto getElementType = [&](Type type) -> Type {
+ if (type.isIntOrIndexOrFloat())
+ return type;
+ else if (llvm::isa<VectorType>(type))
+ return llvm::dyn_cast<VectorType>(type).getElementType();
+ else if (llvm::isa<TensorDescType>(type))
+ return llvm::dyn_cast<TensorDescType>(type).getElementType();
+ llvm_unreachable("Unsupported type.");
+ return type;
+ };
+
+ auto tdescElemTy = getElementType(tdescTy);
+ auto valueElemTy = getElementType(valueTy);
+ if (tdescElemTy != valueElemTy)
+ return emitOpError(
+ "Value should have the same element type as TensorDesc.");
+
+ auto getShape = [&](Type type) -> std::vector<int64_t> {
+ std::vector<int64_t> shape;
+ if (type.isIntOrIndexOrFloat())
+ shape.push_back(1);
+ else if (llvm::isa<VectorType>(type))
+ shape = llvm::dyn_cast<VectorType>(type).getShape().vec();
+ else
+ llvm_unreachable("Unsupported type.");
+ return shape;
+ };
+
+ std::vector<int64_t> maskShape = getShape(maskTy);
+ std::vector<int64_t> valueShape = getShape(valueTy);
+ std::vector<int64_t> tdescShape = tdescTy.getShape().vec();
+
+ if (tdescShape != maskShape)
+ return emitOpError("Mask should have the same shape as TensorDesc.");
+
+ auto mode = getMode();
+ auto mapping = tdescTy.getMapping();
+ if (mode == ModeKind::SIMT || mapping) {
+ return emitOpError("LoadGatherOp only supports VC mode and mapping "
+ "attribute of TensorDesc is not expected.\n");
+ }
+
+ if (getTransposeAttr()) {
+ auto trans = getTranspose().value();
+ if (tdescShape.size() < trans.size())
+ return emitWarning("Invalid transpose attr. It is ignored.");
+ transpose(trans, tdescShape);
+ }
+
+ if (getVnniAxis()) {
+ auto axis = getVnniAxis().value();
+ auto vnni_factor = valueShape.back();
+ tdescShape[axis] /= vnni_factor;
+ tdescShape.push_back(vnni_factor);
+ }
+
+ if (valueShape != tdescShape)
+ return emitOpError(
+ "Result shape doesn't match TensorDesc shape. when VNNI is not enabled,"
+ "the result should have the same shape (or transposed shape if "
+ "transpose is also enabled) as TensorDesc. When VNNI is enabled, "
+ "the result should have one more dimention than the TensorDesc, "
+ "with last dimention having vnni factor, but having same number of"
+ "total data elements. The vnni factor are typically calculated as "
+ "simd_lane_width/elementTypeBitWidth. For element type having "
+ "more than 32 bits, vnni shouldn't be used.\n");
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreScatterOp
+//===----------------------------------------------------------------------===//
+void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
+ Value value, Value TensorDesc, Value mask,
+ CacheKindAttr l1_hint, CacheKindAttr l2_hint,
+ CacheKindAttr l3_hint) {
+ state.addOperands(value);
+ state.addOperands(TensorDesc);
+ state.addOperands(mask);
+ if (l1_hint)
+ state.getOrAddProperties<Properties>().l1_hint = l1_hint;
+ if (l2_hint)
+ state.getOrAddProperties<Properties>().l2_hint = l2_hint;
+ if (l3_hint)
+ state.getOrAddProperties<Properties>().l3_hint = l3_hint;
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+}
+
+void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
+ Value value, Value TensorDesc, Value mask,
+ CacheKind l1_hint, CacheKind l2_hint,
+ CacheKind l3_hint) {
+ state.addOperands(value);
+ state.addOperands(TensorDesc);
+ state.addOperands(mask);
+ state.getOrAddProperties<Properties>().l1_hint =
+ CacheKindAttr::get(builder.getContext(), l1_hint);
+ state.getOrAddProperties<Properties>().l2_hint =
+ CacheKindAttr::get(builder.getContext(), l2_hint);
+ ;
+ state.getOrAddProperties<Properties>().l3_hint =
+ CacheKindAttr::get(builder.getContext(), l3_hint);
+ ;
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+}
+
+ParseResult StoreScatterOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands;
+ llvm::SmallVector<Type> Types;
+ llvm::SMLoc OperandsLoc;
+
+ OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(Operands))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+
+ return success();
+}
+
+void StoreScatterOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getValue();
+ printer << ",";
+ printer << ' ';
+ printer << getTensorDesc();
+ printer << ",";
+ printer << ' ';
+ printer << getMask();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getValue().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getMask().getType();
+}
+
+LogicalResult StoreScatterOp::verify() {
+ auto tdescTy = getTensorDesc().getType();
+ auto valueTy = getValue().getType();
+ auto maskTy = getMask().getType();
+ auto mode = getMode();
+ auto mapping = tdescTy.getMapping();
+
+ if (mode != ModeKind::VC || mapping)
+ return emitOpError("StoreScatterOp only supports VC mode and mapping "
+ "attribute of TensorDesc is not expected.\n");
+
+ if (!tdescTy.getScattered())
+ return emitOpError("Invalid TensorDesc. StoreScatterOp only works on "
+ "TensorDescs with ScatteredAttr.");
+
+ auto getShape = [&](Type type) -> std::vector<int64_t> {
+ std::vector<int64_t> shape;
+ if (type.isIntOrIndexOrFloat())
+ shape.push_back(1);
+ else if (llvm::isa<VectorType>(type))
+ shape = llvm::dyn_cast<VectorType>(type).getShape().vec();
+ else
+ llvm_unreachable("Unsupported type.");
+ return shape;
+ };
+
+ std::vector<int64_t> maskShape = getShape(maskTy);
+ std::vector<int64_t> valueShape = getShape(valueTy);
+ std::vector<int64_t> tdescShape = tdescTy.getShape().vec();
+
+ if (valueShape != maskShape) {
+ return emitOpError("Mask and value should have the same shape/size");
+ }
+
+ if (tdescShape != valueShape) {
+ return emitOpError("TensorDesc shape and value shape doesn't match. ")
+ << "The expected/derived value shape is: " << makeString(tdescShape)
+ << ".\nMask and value should have the same shape/size as "
+ "TensorDesc.\n";
+ }
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_PrefetchOp
+//===----------------------------------------------------------------------===//
+void PrefetchOp::build(OpBuilder &builder, OperationState &state,
+ Value TensorDesc, CacheKindAttr l1_hint,
+ CacheKindAttr l2_hint, CacheKindAttr l3_hint) {
+ state.addOperands(TensorDesc);
+ if (l1_hint)
+ state.getOrAddProperties<Properties>().l1_hint = l1_hint;
+
+ if (l2_hint)
+ state.getOrAddProperties<Properties>().l2_hint = l2_hint;
+
+ if (l3_hint)
+ state.getOrAddProperties<Properties>().l3_hint = l3_hint;
+
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+}
+
+void PrefetchOp::build(OpBuilder &builder, OperationState &state,
+ Value TensorDesc, CacheKind l1_hint, CacheKind l2_hint,
+ CacheKind l3_hint) {
+ state.addOperands(TensorDesc);
+ state.getOrAddProperties<Properties>().l1_hint =
+ CacheKindAttr::get(builder.getContext(), l1_hint);
+ state.getOrAddProperties<Properties>().l2_hint =
+ CacheKindAttr::get(builder.getContext(), l2_hint);
+ state.getOrAddProperties<Properties>().l3_hint =
+ CacheKindAttr::get(builder.getContext(), l3_hint);
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+}
+
+ParseResult PrefetchOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> TensorDescOperands(1);
+ llvm::SmallVector<Type> TensorDescTypes(1);
+ llvm::SMLoc TensorDescOperandsLoc;
+
+ TensorDescOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperand(TensorDescOperands[0]))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseType(TensorDescTypes[0]))
+ return failure();
+
+ if (parser.resolveOperands(TensorDescOperands, TensorDescTypes,
+ TensorDescOperandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void PrefetchOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+}
+
+LogicalResult PrefetchOp::verify() {
+ auto mode = getMode();
+ auto tdescTy = getTensorDesc().getType();
+ auto mapping = tdescTy.getMapping();
+
+ auto isValidHint = [&](CacheKindAttr attr) -> bool {
+ if (!attr)
+ return true;
+ auto kind = attr.getValue();
+ return kind == CacheKind::CACHED || kind == CacheKind::UNCACHED ||
+ kind == CacheKind::STREAMING || kind == CacheKind::READ_INVALIDATE;
+ };
+
+ if (!isValidHint(getL1HintAttr()))
+ return emitOpError("invlid l1_hint: ") << getL1HintAttr();
+
+ if (!isValidHint(getL2HintAttr()))
+ return emitOpError("invlid l2_hint: ") << getL2HintAttr();
+
+ if (!isValidHint(getL3HintAttr()))
+ return emitOpError("invlid l3_hint: ") << getL3HintAttr();
+
+ if (!tdescTy.getScattered())
+ return emitOpError("Invalid TensorDesc. PrefetchOp only works on "
+ "TensorDescs with ScatteredAttr.");
+
+ if (mode != ModeKind::VC || mapping) {
+ return emitOpError("PrefetchOp only supports VC mode, and mapping "
+ "attribute of TensorDesc is not expected.\n");
+ }
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateOffsetOp
+//===----------------------------------------------------------------------===//
+void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
+ Type result, Value TensorDesc, Value offsets) {
+ state.addOperands(TensorDesc);
+ state.addOperands(offsets);
+ state.getOrAddProperties<Properties>().mode =
+ xegpu::ModeKindAttr::get(builder.getContext(), xegpu::ModeKind::VC);
+ state.addTypes(result);
+}
+
+ParseResult UpdateOffsetOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands;
+ llvm::SmallVector<Type> Types;
+
+ auto OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(Operands))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> resultTypes(1);
+ if (parser.parseType(resultTypes[0]))
+ return failure();
+ result.addTypes(resultTypes);
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void UpdateOffsetOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getTensorDesc();
+ printer << ",";
+ printer << ' ';
+ printer << getOffsets();
+
+ llvm::SmallVector<llvm::StringRef> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getTensorDesc().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getOffsets().getType();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getResult().getType();
+}
+
+LogicalResult UpdateOffsetOp::verify() {
+ auto mode = getMode();
+ if (mode != ModeKind::VC)
+ return emitOpError("UpdateOffsetOp only work on VC mode.\n");
+
+ auto srcTy = getTensorDesc().getType();
+ auto resTy = getResult().getType();
+ if (srcTy != resTy)
+ return emitOpError("The result should have the same type (shape and "
+ "encoding attribute) as the input TensorDesc.");
+
+ if (!srcTy.getScattered()) {
+ return emitOpError("Invalid TensorDesc. UpdateOffsetOp only works on "
+ "TensorDescs with ScatteredAttr.");
+ }
+
+ auto offTy = llvm::dyn_cast<VectorType>(getOffsets().getType());
+ if (!offTy || offTy.getRank() != 1)
+ return emitOpError("The offset should be an 1D vector.\n");
+
+ auto shape = srcTy.getShape();
+ if (shape[0] != offTy.getShape()[0])
+ return emitOpError(
+ "The offset should have same length as the dim-0 of TensorDesc.");
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_DpasOp
+//===----------------------------------------------------------------------===//
+ParseResult DpasOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands;
+ llvm::SmallVector<Type> Types;
+
+ llvm::SMLoc OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(Operands))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> resultTypes(1);
+ if (parser.parseType(resultTypes[0]))
+ return failure();
+ result.addTypes(resultTypes);
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+
+ return success();
+}
+
+void DpasOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer << ' ';
+ printer << getLhs();
+ printer << ",";
+ printer << ' ';
+ printer << getRhs();
+ if (Value value = getAcc())
+ printer << ", " << value;
+
+ llvm::SmallVector<llvm::StringRef, 2> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getLhs().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getRhs().getType();
+ if (getAcc()) {
+ printer << ",";
+ printer << ' ';
+ printer << llvm::ArrayRef<Type>(getAcc().getType());
+ }
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getResult().getType();
+}
+
+LogicalResult DpasOp::verify() {
+ int64_t lhsRank = getLhsType().getRank();
+ int64_t rhsRank = getRhsType().getRank();
+ Type lhsElemType = getLhsType().getElementType();
+ Type rhsElemType = getRhsType().getElementType();
+
+ if (lhsElemType != rhsElemType)
+ return emitOpError("lhs and rhs element type does not match for dpas op");
+
+ if (getAcc() && getAccType() != getResultType())
+ return emitOpError("Accumulator and Result for dpas op should have the "
+ "same type (both shape and element type).");
+
+ if (lhsRank != rhsRank || lhsRank != 3)
+ return emitOpError(
+ "lhs and rhs rank does not match for dpas op, or their rank is not 3.");
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_InvokeSIMDOp
+//===----------------------------------------------------------------------===//
+void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state,
+ SymbolRefAttr callee, TypeRange results,
+ ArgTypeKindAttr argType, ValueRange operands) {
+ state.addOperands(operands);
+ state.addAttribute("argType", argType);
+ state.addAttribute("callee", callee);
+ state.addTypes(results);
+}
+
+void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state,
+ StringAttr callee, TypeRange results,
+ ArgTypeKindAttr argType, ValueRange operands) {
+ build(builder, state, SymbolRefAttr::get(callee), results, argType, operands);
+}
+
+void InvokeSIMDOp::build(OpBuilder &builder, OperationState &state,
+ llvm::StringRef callee, TypeRange results,
+ ArgTypeKindAttr argType, ValueRange operands) {
+ build(builder, state, StringAttr::get(builder.getContext(), callee), results,
+ argType, operands);
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_AtomicRMWOp
+//===----------------------------------------------------------------------===//
+void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result,
+ AtomicRMWKindAttr kind, Value tensorDesc, Value mask,
+ Value value) {
+ state.addOperands(tensorDesc);
+ state.addOperands(mask);
+ if (value)
+ state.addOperands(value);
+ state.getOrAddProperties<Properties>().kind = kind;
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(result);
+}
+
+void AtomicRMWOp::build(OpBuilder &builder, OperationState &state, Type result,
+ AtomicRMWKind kind, Value tensorDesc, Value mask,
+ Value value) {
+ state.addOperands(tensorDesc);
+ state.addOperands(mask);
+ if (value)
+ state.addOperands(value);
+ state.getOrAddProperties<Properties>().kind =
+ AtomicRMWKindAttr::get(builder.getContext(), kind);
+ state.getOrAddProperties<Properties>().mode =
+ ModeKindAttr::get(builder.getContext(), ModeKind::VC);
+ state.addTypes(result);
+}
+
+ParseResult AtomicRMWOp::parse(OpAsmParser &parser, OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand> Operands;
+ llvm::SmallVector<Type, 1> Types;
+ llvm::SMLoc OperandsLoc;
+
+ llvm::SmallVector<Type> resultTypes(1);
+
+ xegpu::AtomicRMWKindAttr kindAttr;
+ if (parser.parseCustomAttributeWithFallback(kindAttr, Type{}))
+ return failure();
+ if (kindAttr)
+ result.getOrAddProperties<AtomicRMWOp::Properties>().kind = kindAttr;
+
+ OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(Operands))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ if (parser.parseCustomTypeWithFallback(resultTypes[0]))
+ return failure();
+ result.addTypes(resultTypes);
+
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void AtomicRMWOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+
+ printer.printStrippedAttrOrType(getKindAttr());
+ printer << ' ';
+ printer << getTensorDesc();
+ printer << ",";
+ printer << ' ';
+ printer << getMask();
+ if (Value value = getValue())
+ printer << ", " << value;
+
+ llvm::SmallVector<llvm::StringRef, 2> elidedAttrs;
+ elidedAttrs.push_back("kind");
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+ printer << ' ' << ":";
+ printer << ' ';
+ printer << getOperation()->getOperandTypes();
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getResult().getType();
+}
+
+LogicalResult AtomicRMWOp::verify() {
+ auto mode = getMode();
+ if (mode != ModeKind::VC)
+ return emitOpError("AtomicRMWOp only work on VC mode.\n");
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNbarrierOp
+//===----------------------------------------------------------------------===//
+ParseResult CreateNbarrierOp::parse(OpAsmParser &parser,
+ OperationState &result) {
+ llvm::SmallVector<OpAsmParser::UnresolvedOperand, 2> Operands;
+ llvm::SmallVector<Type> Types;
+ llvm::SMLoc OperandsLoc;
+
+ OperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(Operands))
+ return failure();
+
+ auto loc = parser.getCurrentLocation();
+ if (parseOptionalAttrDictWithCustomAttrs(parser, result))
+ return failure();
+
+ if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() {
+ return parser.emitError(loc)
+ << "'" << result.name.getStringRef() << "' op ";
+ })))
+ return failure();
+
+ if (parser.parseColon())
+ return failure();
+
+ if (parser.parseLParen())
+ return failure();
+
+ if (parser.parseTypeList(Types))
+ return failure();
+
+ if (parser.parseRParen())
+ return failure();
+
+ if (parser.parseArrow())
+ return failure();
+
+ llvm::SmallVector<Type> resultTypes(1);
+ if (parser.parseType(resultTypes[0]))
+ return failure();
+
+ result.addTypes(resultTypes);
+ if (parser.resolveOperands(Operands, Types, OperandsLoc, result.operands))
+ return failure();
+ return success();
+}
+
+void CreateNbarrierOp::print(OpAsmPrinter &printer) {
+ auto mode = getMode();
+ auto printDefaults = printDefaultValues();
+ llvm::SmallVector<llvm::StringRef, 2> elidedAttrs;
+ if (!printDefaults && mode == xegpu::ModeKind::SIMT)
+ elidedAttrs.push_back("mode");
+
+ printer << ' ';
+ printer << getNbarrierId();
+ printer << ",";
+ printer << ' ';
+ printer << getNbarrierRole();
+ printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs);
+ printer << ' ' << ":";
+ printer << ' ' << "(";
+ printer << getNbarrierId().getType();
+ printer << ",";
+ printer << ' ';
+ printer << getNbarrierRole().getType();
+ printer << ")";
+ printer << ' ' << "->";
+ printer << ' ';
+ printer << getResult().getType();
+}
+
+} // namespace xegpu
+} // namespace mlir
+
+#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
+#define GET_OP_CLASSES
+#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
diff --git a/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir
new file mode 100644
index 00000000000000..64a6f547fbd29d
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/XeGPUOps.mlir
@@ -0,0 +1,110 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc({{.*}}) {
+func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %2 = xegpu.create_nd_tdesc %src[2, 4] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
+func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc {{.*}} {chunk_size_per_lane = 2 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ return
+}
+
+// CHECK-LABEL: func @test_load_nd_vc({{.*}}) {
+func.func @test_load_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+
+ // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>, vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ return
+}
+
+// CHECK-LABEL: func @test_store_nd_vc({{.*}}) {
+func.func @test_store_nd_vc(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc}
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+
+ // CHECK: xegpu.load_nd {{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1 {mode=vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+
+ // CHECK: xegpu.store_nd {{%[0-9], %[0-9]}} {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ return
+}
+
+// CHECK-LABEL: func @test_dpas_vc({{.*}}) {
+func.func @test_dpas_vc(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
+ // CHECK: xegpu.dpas {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ %1 = xegpu.dpas %a, %b {mode = vc}: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_update_nd_offset_vc({{.*}}) {
+func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.load_nd {{%[0-9]}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+
+ // CHECK: xegpu.update_nd_offset {{%[0-9]}}, [{{%c[0-9], %c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc}: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_prefetch_nd_vc({{.*}}) {
+func.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc {{.*}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: xegpu.prefetch_nd {{%[0-9]}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
new file mode 100644
index 00000000000000..f80df161a543ac
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
@@ -0,0 +1,43 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
+func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {mode=vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32>
+ xegpu.atomic_rmw #xegpu<atomic_rmw_kind addf> %1, %mask, %value {mode=vc}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
+func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
+ xegpu.atomic_rmw mulf %1, %mask, %value {mode=vc}
+ : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
+func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
+ xegpu.atomic_rmw andi %1, %mask, %value {mode=vc}
+ : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
+
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
new file mode 100644
index 00000000000000..0f7229a02aa180
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
@@ -0,0 +1,38 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
+func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x1xf32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32>
+ xegpu.atomic_rmw addf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
+func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32>
+ xegpu.atomic_rmw mulf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
+func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) {
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
+
+ // CHECK: xegpu.atomic_rmw
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32>
+ xegpu.atomic_rmw andi %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
+
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/barrier_ops.mlir b/mlir/test/Dialect/XeGPU/IR/barrier_ops.mlir
new file mode 100644
index 00000000000000..a1abc9e171bcaf
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/barrier_ops.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @alloc_nbarrier({{.*}}) {
+func.func @alloc_nbarrier() {
+ // CHECK: xegpu.alloc_nbarrier
+ xegpu.alloc_nbarrier 8
+ return
+}
+
+// CHECK-LABEL: func @create_nbarrier({{.*}}) {
+func.func @create_nbarrier() {
+ %nbarrier_id = arith.constant 1 : i8
+ %nbarrier_role = arith.constant 0 : i8
+ // CHECK: xegpu.create_nbarrier
+ // CHECK-SAME: {num_consumers = 32 : i8, num_producers = 32 : i8}
+ // CHECK-SAME: (i8, i8) -> !xegpu.nbarrier
+ %nbarrier = xegpu.create_nbarrier %nbarrier_id, %nbarrier_role {num_producers = 32 :i8 , num_consumers = 32 : i8}
+ : (i8, i8) -> !xegpu.nbarrier
+ return
+}
+
+// CHECK-LABEL: func @nbarrier_arrive({{.*}}) {
+func.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
+ // CHECK: xegpu.nbarrier_arrive
+ // CHECK-SAME: !xegpu.nbarrier
+ xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
+ return
+}
+
+// CHECK-LABEL: func @nbarrier_wait({{.*}}) {
+func.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
+ // CHECK: xegpu.nbarrier_wait
+ // CHECK-SAME: !xegpu.nbarrier
+ xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
+ return
+}
+
+// CHECK-LABEL: func @compile_hint({{.*}}) {
+func.func @compile_hint() {
+ // CHECK: xegpu.compile_hint
+ xegpu.compile_hint
+ return
+}
+
+// CHECK-LABEL: func @mfence({{.*}}) {
+func.func @mfence() {
+ // CHECK: xegpu.mfence {fence_op = "none", fence_scope = "local", memory_kind = "ugm"}
+ xegpu.mfence {memory_kind = "ugm" , fence_op = "none", fence_scope = "local"}
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
new file mode 100644
index 00000000000000..cebf59f12939da
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
@@ -0,0 +1,111 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+#sg_map_fp16 = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
+
+func.func @test_create_nd_tdesc_0(%src: memref<24x32xf16>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %2 = xegpu.create_nd_tdesc %src[2, 4]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_1({{.*}}) {
+func.func @test_create_nd_tdesc_1(%src: memref<24x32xf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_2({{.*}}) {
+func.func @test_create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_3({{.*}}) {
+func.func @test_create_nd_tdesc_3(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_nd_tdesc_4({{.*}}) {
+func.func @test_create_nd_tdesc_4(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
+ : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_5({{.*}}) {
+func.func @test_create_nd_tdesc_5(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
+ : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_6({{.*}}) {
+func.func @test_create_nd_tdesc_6(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
+ : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_7({{.*}}) {
+func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %src[%offset] : memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #sg_map_fp16>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_nd_tdesc_8({{.*}}) {
+func.func @test_create_nd_tdesc_8(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
+ %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
+ : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) {
+func.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
+ %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref<?x?xf16>
+ -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
new file mode 100644
index 00000000000000..a21bf792fe0792
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
@@ -0,0 +1,115 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// ----- SIMD -----
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_0({{.*}}) {
+func.func @test_create_nd_tdesc_vc_0(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %2 = xegpu.create_nd_tdesc %src[2, 4] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_1({{.*}}) {
+func.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg1, %arg2]
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_2({{.*}}) {
+func.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_3({{.*}}) {
+func.func @test_create_nd_tdesc_vc_3(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_4({{.*}}) {
+func.func @test_create_nd_tdesc_vc_4(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_5({{.*}}) {
+func.func @test_create_nd_tdesc_vc_5(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc}
+ : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_6({{.*}}) {
+func.func @test_create_nd_tdesc_vc_6(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc}
+ : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_7({{.*}}) {
+func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%offset] {mode = vc} : memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_8({{.*}}) {
+func.func @test_create_nd_tdesc_vc_8(%src: memref<?x?xf32>, %w : index, %h : index, %x : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {mode = vc}
+ : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) {
+func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc} : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2>>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir b/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
new file mode 100644
index 00000000000000..8fb5ac824ddb27
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
+func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
+ %1 = xegpu.create_tdesc %src, %offsets {mode=vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
new file mode 100644
index 00000000000000..245d862e302a7c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -0,0 +1,51 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+
+// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
+func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ return
+}
+
+// CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) {
+func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index>
+ -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ return
+}
+
+// CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) {
+func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+ return
+}
+
+// CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) {
+func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ return
+}
+
+
+// CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) {
+func.func @test_create_tdesc_vc_5(%src: memref<?xf32>, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<?xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}
+ : memref<?xf32>, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.tdesc_attr<memory_scope = slm, #xegpu.scattered>>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/invalid_vc.mlir b/mlir/test/Dialect/XeGPU/IR/invalid_vc.mlir
new file mode 100644
index 00000000000000..4a92fa77c5815e
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/invalid_vc.mlir
@@ -0,0 +1,70 @@
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -verify-diagnostics
+
+// -----
+func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // expected-error at +1 {{Expecting the rank of shape, strides, offsets and memref type should match with each other}}
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// -----
+func.func @test_create_nd_tdesc_vc_3(%input: memref<?xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+
+ // expected-error at +1 {{Expecting the rank of shape, strides, offsets and memref type should match with each other}}
+ %1 = xegpu.create_nd_tdesc %input[%c0, %c1], [%c8, %c16], [%c16, %c1] {mode = vc} : memref<?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+
+// -----
+func.func @test_create_nd_tdesc_vc_4(%input: memref<?x?xf32>) {
+ %c1 = arith.constant 2 : index
+ %c8 = arith.constant 8 : index
+
+ // expected-error at +1 {{Expecting the rank of shape, strides, offsets and memref type should match with each other}}
+ %1 = xegpu.create_nd_tdesc %input[%c1], [%c8], [%c1] {mode = vc}
+ : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+ return
+}
+
+// -----
+func.func @test_create_nd_tdesc_vc_5(%input: memref<24x32x64xf32>) {
+ %c1 = arith.constant 2 : index
+ %c8 = arith.constant 8 : index
+
+ // expected-error at +1 {{operand #0 must be 1D/2D memref}}
+ %1 = xegpu.create_nd_tdesc %input[%c1, %c1, %c8] {mode = vc}
+ : memref<24x32x64xf32> -> !xegpu.tensor_desc<8x16x8xf32>
+ return
+}
+
+// -----
+func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) {
+ // expected-error at +1 {{operand #1 must be vector of index values of ranks 1}}
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
+ : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+ return
+}
+
+// -----
+func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) {
+ %0 = arith.constant dense<1>: vector<16x8xi1>
+ // CHECK: xegpu.create_tdesc
+ // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8}
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scattered>
+
+ // expected-error at +1 {{Result shape doesn't match TensorDesc shape.}}
+ %2 = xegpu.load %1, %0 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16x8xf16, #xegpu.scattered>, vector<16x8xi1> -> vector<8x8x4xf16>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir
new file mode 100644
index 00000000000000..a3cb890483e634
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/load_gather_vc.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+
+// CHECK-LABEL: func @test_load_gather_vc({{.*}}) {
+func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
+ %0 = arith.constant dense<1>: vector<16xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_load_gather_vc_2({{.*}}) {
+func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
+ %0 = arith.constant dense<1>: vector<16x8xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 8 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8}
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>, transpose = array<i64: 1, 0>}
+ // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
+ %2 = xegpu.load %1, %0 {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_load_gather_vc_3({{.*}}) {
+func.func @test_load_gather_vc_3(%src: ui64, %offsets : vector<16xindex>) {
+ %0 = arith.constant dense<1>: vector<16xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 1}
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir
new file mode 100644
index 00000000000000..0644565c3f002e
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/load_nd.mlir
@@ -0,0 +1,164 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+#sg_map_fp16_a = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_fp16_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_d = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
+// CHECK-LABEL: func @test_load_nd_fp16({{.*}}) {
+func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : memref<24x32xf16>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %A[%c0, %c1]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>> -> vector<4x1x2xf16>
+ %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %3 = xegpu.create_nd_tdesc %B[%c0, %c1]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1x2xf16>
+ %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %5 = xegpu.create_nd_tdesc %C[%c0, %c1]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> -> vector<8x1xf32>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %7 = xegpu.create_nd_tdesc %A[%c0, %c1]
+ : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d>
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>> -> vector<4x1x2xf16>
+ %8 = xegpu.load_nd %7 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> -> vector<4x1x2xf16>
+
+ return
+}
+
+#sg_map_bf16_a = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_bf16_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_bf16_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+// CHECK-LABEL: func @test_load_nd_bf16({{.*}}) {
+func.func @test_load_nd_bf16(%A: memref<24x32xbf16>, %B : memref<24x32xbf16>, %C : memref<24x32xbf16>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>> -> vector<4x1x2xbf16>
+ %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a> -> vector<4x1x2xbf16>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1x2xbf16>
+ %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> -> vector<8x1x2xbf16>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<24x32xbf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_bf16_c> -> vector<8x1xf32>
+
+ return
+}
+
+#sg_map_i8_a = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 4]>
+#sg_map_i8_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_i8_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+// CHECK-LABEL: func @test_load_nd_i8({{.*}}) {
+func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : memref<64x64xi8>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 4]>>
+ %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 4]>> -> vector<4x1x4xi8>
+ %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x1x4xi8>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8>
+ // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}} {vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1x4xi8>
+ %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x1x4xi8>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}] : memref<64x64xi8>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xi32>
+ %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> -> vector<8x1xi32>
+
+ return
+}
+
+#sg_map_f64_a = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>
+#sg_map_f64_b = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>
+#sg_map_f64_c = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>
+// CHECK-LABEL: func @test_load_nd_f64({{.*}}) {
+func.func @test_load_nd_f64(%A: memref<64x64xf64>, %B : memref<64x64xf64>, %C : memref<64x64xf64>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<64x64xf64>
+ // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %A[%c0, %c1]
+ : memref<64x64xf64> -> !xegpu.tensor_desc<4x8xf64, #sg_map_f64_a>
+
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ // CHECK-SAME: -> vector<2x1xf64>
+ %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<4x8xf64, #sg_map_f64_a> -> vector<2x1xf64>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<64x64xf64>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ %3 = xegpu.create_nd_tdesc %B[%c0, %c1]
+ : memref<64x64xf64> -> !xegpu.tensor_desc<8x8xf64, #sg_map_f64_b>
+
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ // CHECK-SAME: -> vector<4x1xf64>
+ %4 = xegpu.load_nd %3 : !xegpu.tensor_desc<8x8xf64, #sg_map_f64_b> -> vector<4x1xf64>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<64x64xf64>
+ // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ %5 = xegpu.create_nd_tdesc %C[%c0, %c1]
+ : memref<64x64xf64> -> !xegpu.tensor_desc<4x8xf64, #sg_map_f64_c>
+
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 1]>>
+ // CHECK-SAME: -> vector<2x1xf64>
+ %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<4x8xf64, #sg_map_f64_c> -> vector<2x1xf64>
+
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir
new file mode 100644
index 00000000000000..78980b551c0677
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/load_nd_vc.mlir
@@ -0,0 +1,69 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// -- SIMD ---
+// CHECK-LABEL: func @test_load_nd_simd_f32({{.*}}) {
+func.func @test_load_nd_simd_f32(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]+}}, %{{c[0-9]+}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, l3_hint = #xegpu<cache_kind streaming>, mode = #xegpu<mode_kind vc>, transpose = array<i64: 1, 0>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ %3 = xegpu.load_nd %1 {mode= vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached, l3_hint=streaming} : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ return
+}
+
+// CHECK-LABEL: func @test_load_nd_simd_f16({{.*}}) {
+func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+
+ // CHECK: xegpu.load_nd %{{[0-9]+}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>, vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ %2 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ return
+}
+
+// CHECK-LABEL: func @test_load_nd_simd_bf16({{.*}}) {
+func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ %c1 = arith.constant 1 : index
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : ui64 -> !xegpu.tensor_desc<8x16xbf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : ui64 -> !xegpu.tensor_desc<8x16xbf16>
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>, vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
+ %2 = xegpu.load_nd %1 {mode=vc, vnni_axis = 1, l1_hint = cached, l2_hint = uncached} : !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
+
+ return
+}
+
+// CHECK-LABEL: func @test_load_nd_block_array_simd_f16({{.*}}) {
+func.func @test_load_nd_block_array_simd_f16(%src: memref<8x32xf16>) {
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[0, 0] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<array_length = 2>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc}
+ : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<array_length = 2>>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<array_length = 2>> -> vector<2x8x16xf16>
+ %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<array_length = 2>> -> vector<2x8x16xf16>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
new file mode 100644
index 00000000000000..6e2cb4de4ce1d4
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+// CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_0({{.*}}) {
+func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu<mode_kind vc>} : !xegpu.tensor_desc<8x16xf32>
+ xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xf32>
+
+ return
+}
+
+// CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_1({{.*}}) {
+func.func @test_prefetch_nd_tdesc_vc_1(%src: memref<24x32xf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+
+ // CHECK: xegpu.prefetch_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf16>
+ return
+}
+
+
+// CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_i8({{.*}}) {
+func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+
+ // CHECK: xegpu.prefetch_nd %{{[0-9]}} {mode = #xegpu<mode_kind vc>} : !xegpu.tensor_desc<8x16xi8>
+ xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<8x16xi8>
+
+ return
+}
+
+// CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_bf16({{.*}}) {
+func.func @test_prefetch_nd_tdesc_vc_bf16(%src: memref<24x32xbf16>, %x : index, %y : index) {
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
+ : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ // CHECK: xegpu.prefetch_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind uncached>, l2_hint = #xegpu<cache_kind cached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16>
+ xegpu.prefetch_nd %1 {mode = vc, l1_hint = uncached, l2_hint = cached}: !xegpu.tensor_desc<8x16xbf16>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir b/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
new file mode 100644
index 00000000000000..ff6f31c77064af
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
@@ -0,0 +1,71 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// ---- BF16 ------
+
+#sg_map_fp16_a = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_fp16_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+// CHECK-LABEL: func @test_gemm_bf16({{.*}}) {
+func.func @test_gemm_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16>, %c: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c1024 = arith.constant 1024 : index
+
+ %c0_1 = arith.constant 0 : i32
+ %c1_1 = arith.constant 1 : i32
+
+
+ scf.for %i= %c0 to %c1024 step %c8 {
+ scf.for %j= %c0 to %c1024 step %c16 {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xbf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %1 = xegpu.create_nd_tdesc %a[%i, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xbf16>
+ // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.create_nd_tdesc %b[%c0, %j] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
+
+ %3 = arith.constant dense<0.0> : vector<8x1xf32>
+
+ %tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16 iter_args(%subA = %1, %subB = %2, %subC = %3)
+ -> (!xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>, vector<8x1xf32>) {
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: vector<4x1x2xbf16>
+ %4 = xegpu.load_nd %subA {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a> -> vector<4x1x2xbf16>
+
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: vector<8x1x2xbf16>
+ %5 = xegpu.load_nd %subB {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b> -> vector<8x1x2xbf16>
+
+ // CHECK: xegpu.dpas
+ // CHECK-SAME: vector<4x1x2xbf16>, vector<8x1x2xbf16>, vector<8x1xf32> -> vector<8x1xf32>
+ %6 = xegpu.dpas %4, %5, %subC : vector<4x1x2xbf16>, vector<8x1x2xbf16>, vector<8x1xf32> -> vector<8x1xf32>
+
+ %7 = xegpu.update_nd_offset %subA, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
+ -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
+
+ %8 = xegpu.update_nd_offset %subB, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
+ -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
+
+ scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>, vector<8x1xf32>
+ }
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xf32>
+ %9 = xegpu.create_nd_tdesc %c[%i, %j] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
+
+ // CHECK: xegpu.store_nd
+ // CHECK-SAME: vector<8x1xf32>
+ xegpu.store_nd %result, %9 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
+ }
+ }
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir b/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
new file mode 100644
index 00000000000000..794a6b6f1afb9c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// ---- BF16 VC ------
+
+// CHECK-LABEL: func @test_gemm_vc_bf16({{.*}}) {
+func.func @test_gemm_vc_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16>, %c: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c1024 = arith.constant 1024 : index
+
+ %c0_1 = arith.constant 0 : i32
+ %c1_1 = arith.constant 1 : i32
+
+
+ scf.for %i= %c0 to %c1024 step %c8 {
+ scf.for %j= %c0 to %c1024 step %c16 {
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ %1 = xegpu.create_nd_tdesc %a[%i, %c0] {mode = vc} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+ %2 = xegpu.create_nd_tdesc %b[%c0, %j] {mode = vc} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+
+ %3 = arith.constant dense<0.0> : vector<8x16xf32>
+
+ %tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16
+ iter_args(%subA = %1, %subB = %2, %subC = %3)
+ -> (!xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>, vector<8x16xf32>) {
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
+ %4 = xegpu.load_nd %subA {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
+
+ // CHECK: xegpu.load_nd
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16> -> vector<8x16x2xbf16>
+ %5 = xegpu.load_nd %subB {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16> -> vector<8x16x2xbf16>
+
+ // CHECK: xegpu.dpas
+ // CHECK-SAME: vector<8x8x2xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %6 = xegpu.dpas %4, %5, %subC {mode = vc} : vector<8x8x2xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+ %7 = xegpu.update_nd_offset %subA, [%c0, %c16] {mode = vc} : !xegpu.tensor_desc<8x16xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+
+ %8 = xegpu.update_nd_offset %subB, [%c16, %c0] {mode = vc} : !xegpu.tensor_desc<16x16xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+
+ scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>, vector<8x16xf32>
+ }
+
+ // CHECK: xegpu.create_nd_tdesc
+ // CHECK-SAME: memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %9 = xegpu.create_nd_tdesc %c[%i, %j] {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.store_nd
+ // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %result, %9 {mode = vc}: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ }
+ }
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir
new file mode 100644
index 00000000000000..170b3a9fe81474
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/store_nd_vc.mlir
@@ -0,0 +1,83 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_store_nd_vc_bf16({{.*}}) {
+func.func @test_store_nd_vc_bf16(%src: memref<24x32xbf16>, %dst: memref<24x32xbf16>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
+ %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
+
+ // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16>
+ xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xbf16>, !xegpu.tensor_desc<8x16xbf16>
+ return
+}
+
+// CHECK-LABEL: func @test_store_nd_vc_f64({{.*}}) {
+func.func @test_store_nd_vc_f64(%src: memref<24x32xf64>, %dst: memref<24x32xf64>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
+ %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc}
+ : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64>
+ %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xf64> -> vector<8x16xf64>
+
+ // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64>
+ xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xf64>, !xegpu.tensor_desc<8x16xf64>
+ return
+}
+
+// CHECK-LABEL: func @test_store_nd_vc_i8({{.*}}) {
+func.func @test_store_nd_vc_i8(%src: memref<24x32xi8>, %dst: memref<24x32xi8>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+ %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc}
+ : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8>
+ %3 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}: !xegpu.tensor_desc<8x16xi8> -> vector<8x16xi8>
+
+ // CHECK: xegpu.store_nd %{{[0-9]}}, %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8>
+ xegpu.store_nd %3, %2 {mode = vc, l1_hint = write_back, l2_hint = uncached}: vector<8x16xi8>, !xegpu.tensor_desc<8x16xi8>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
new file mode 100644
index 00000000000000..6d98ac3950c31f
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_store_scatter({{.*}}) {
+func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui64) {
+ %0 = arith.constant dense<true>: vector<16xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %2 = xegpu.create_tdesc %dst, %offsets {mode = vc}
+ : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
+ xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
new file mode 100644
index 00000000000000..c1a51712e70037
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_store_scatter_vc({{.*}}) {
+func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) {
+ %0 = arith.constant dense<1>: vector<16xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %2 = xegpu.create_tdesc %dst, %offsets {mode = vc}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
+ xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir
new file mode 100644
index 00000000000000..1b97be77a2d79f
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/update_nd_offset.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+// CHECK-LABEL: func @test_update_nd_offset_vc_0({{.*}}) {
+func.func @test_update_nd_offset_vc_0(%src: memref<24x32xf32>) {
+ %c0 = arith.constant 2 : index
+ %c1 = arith.constant 4 : index
+
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}]
+ // CHECK-SAME: {mode = #xegpu<mode_kind vc>} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
+ : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: xegpu.load_nd %{{[0-9]}}
+ // CHECK-SAME: {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+
+ // CHECK: xegpu.update_nd_offset %{{[0-9]}}, [%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %3 = xegpu.update_nd_offset %1, [%c0, %c1] {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ return
+}
diff --git a/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir
new file mode 100644
index 00000000000000..05b0092d2379b7
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/IR/update_offset_vc.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: func @test_update_offset_VC({{.*}}) {
+func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) {
+ %0 = arith.constant dense<1>: vector<16xi1>
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
+ : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+ %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+
+ %3 = arith.constant dense<16>: vector<16 x index>
+ %4 = arith.addi %offsets, %3: vector<16 x index>
+
+ // CHECK: xegpu.update_offset %{{[0-9]}}, %{{[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+ %5 = xegpu.update_offset %1, %4 {mode = vc}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+
+ return
+}
>From 9cac285ed21833ac88773809816515156d7fcb89 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 18 Jan 2024 10:15:30 -0600
Subject: [PATCH 2/4] update testcases
---
mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir | 43 -------------------
mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 12 ++++--
.../Dialect/XeGPU/IR/create_nd_tdesc.mlir | 22 +++++-----
.../Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 31 ++++++-------
mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir | 11 -----
mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir | 32 +++++++-------
.../test/Dialect/XeGPU/IR/simple_gemm_vc.mlir | 18 +++++---
mlir/test/Dialect/XeGPU/IR/store_scatter.mlir | 29 -------------
8 files changed, 60 insertions(+), 138 deletions(-)
delete mode 100644 mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
delete mode 100644 mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
delete mode 100644 mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
deleted file mode 100644
index f80df161a543ac..00000000000000
--- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw.mlir
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: mlir-opt %s | FileCheck %s
-// Verify the printed output can be parsed.
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
-// Verify the generic form can be parsed.
-// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
-
-// CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
-func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {mode=vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
-
- // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32>
- xegpu.atomic_rmw #xegpu<atomic_rmw_kind addf> %1, %mask, %value {mode=vc}
- : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
-
- return
-}
-
-// CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
-func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
-
- // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
- xegpu.atomic_rmw mulf %1, %mask, %value {mode=vc}
- : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
-
- return
-}
-
-// CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
-func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) {
- %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
- : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
-
- // CHECK: xegpu.atomic_rmw
- // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
- xegpu.atomic_rmw andi %1, %mask, %value {mode=vc}
- : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
-
- return
-}
diff --git a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
index 0f7229a02aa180..90df2a7c80ac5a 100644
--- a/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
+++ b/mlir/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir
@@ -6,9 +6,11 @@
// CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x1xf32>, %mask : vector<16xi1>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
%1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
- // CHECK: xegpu.atomic_rmw
+ // CHECK: xegpu.atomic_rmw addf %{{[0-9]}}, %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32>
xegpu.atomic_rmw addf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32>
@@ -17,9 +19,11 @@ func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : v
// CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
%1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
- // CHECK: xegpu.atomic_rmw
+ // CHECK: xegpu.atomic_rmw mulf %{{[0-9]}}, %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32>
xegpu.atomic_rmw mulf %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
@@ -28,9 +32,11 @@ func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value :
// CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) {
+ // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {chunk_size_per_lane = 2 : i64, mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
%1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
- // CHECK: xegpu.atomic_rmw
+ // CHECK: xegpu.atomic_rmw andi %{{[0-9]}}, %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32>
xegpu.atomic_rmw andi %1, %mask, %value {mode = vc} : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
diff --git a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
index cebf59f12939da..8284d730d4089c 100644
--- a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
+++ b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
@@ -10,12 +10,12 @@ func.func @test_create_nd_tdesc_0(%src: memref<24x32xf16>) {
%c0 = arith.constant 2 : index
%c1 = arith.constant 4 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%c0, %c1]
: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[2, 4]
// CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%2 = xegpu.create_nd_tdesc %src[2, 4]
: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
@@ -25,7 +25,7 @@ func.func @test_create_nd_tdesc_0(%src: memref<24x32xf16>) {
// CHECK-LABEL: func @test_create_nd_tdesc_1({{.*}}) {
func.func @test_create_nd_tdesc_1(%src: memref<24x32xf16>, %x : index, %y : index) {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}]
// CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%x, %y]
: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
@@ -35,7 +35,7 @@ func.func @test_create_nd_tdesc_1(%src: memref<24x32xf16>, %x : index, %y : inde
// CHECK-LABEL: func @test_create_nd_tdesc_2({{.*}}) {
func.func @test_create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
return
@@ -44,7 +44,7 @@ func.func @test_create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index
// CHECK-LABEL: func @test_create_nd_tdesc_3({{.*}}) {
func.func @test_create_nd_tdesc_3(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
return
@@ -54,7 +54,7 @@ func.func @test_create_nd_tdesc_3(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK-LABEL: func @test_create_nd_tdesc_4({{.*}}) {
func.func @test_create_nd_tdesc_4(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
@@ -64,7 +64,7 @@ func.func @test_create_nd_tdesc_4(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK-LABEL: func @test_create_nd_tdesc_5({{.*}}) {
func.func @test_create_nd_tdesc_5(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
@@ -74,7 +74,7 @@ func.func @test_create_nd_tdesc_5(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK-LABEL: func @test_create_nd_tdesc_6({{.*}}) {
func.func @test_create_nd_tdesc_6(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
@@ -83,7 +83,7 @@ func.func @test_create_nd_tdesc_6(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK-LABEL: func @test_create_nd_tdesc_7({{.*}}) {
func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}]
// CHECK-SAME: memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %src[%offset] : memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #sg_map_fp16>
return
@@ -93,7 +93,7 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
// CHECK-LABEL: func @test_create_nd_tdesc_8({{.*}}) {
func.func @test_create_nd_tdesc_8(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[8, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %c1]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
%1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1]
: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
@@ -103,7 +103,7 @@ func.func @test_create_nd_tdesc_8(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) {
func.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[8, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %c1]
// CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr<memory_scope = slm, map = <wi_layout = [2, 8], wi_data = [1, 2]>>>
%1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref<?x?xf16>
-> !xegpu.tensor_desc<64x128xf16, #xegpu.tdesc_attr<memory_scope = slm, map = #sg_map_fp16>>
diff --git a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
index a21bf792fe0792..34cd66c9c69a4e 100644
--- a/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
+++ b/mlir/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir
@@ -10,12 +10,12 @@ func.func @test_create_nd_tdesc_vc_0(%src: memref<24x32xf32>) {
%c0 = arith.constant 2 : index
%c1 = arith.constant 4 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
%1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc}
: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[2, 4] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
%2 = xegpu.create_nd_tdesc %src[2, 4] {mode = vc}
: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
@@ -25,19 +25,16 @@ func.func @test_create_nd_tdesc_vc_0(%src: memref<24x32xf32>) {
// CHECK-LABEL: func @test_create_nd_tdesc_vc_1({{.*}}) {
func.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>, %x : index, %y : index) {
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg1, %arg2]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
- : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
return
}
// CHECK-LABEL: func @test_create_nd_tdesc_vc_2({{.*}}) {
func.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf32>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : ui64 -> !xegpu.tensor_desc<8x16xf32>
return
@@ -46,8 +43,7 @@ func.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : in
// CHECK-LABEL: func @test_create_nd_tdesc_vc_3({{.*}}) {
func.func @test_create_nd_tdesc_vc_3(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
return
@@ -57,8 +53,7 @@ func.func @test_create_nd_tdesc_vc_3(%src: memref<?x?xf32>, %w : index, %h : ind
// CHECK-LABEL: func @test_create_nd_tdesc_vc_4({{.*}}) {
func.func @test_create_nd_tdesc_vc_4(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} : memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
return
@@ -67,8 +62,7 @@ func.func @test_create_nd_tdesc_vc_4(%src: memref<?x?xf32>, %w : index, %h : ind
// CHECK-LABEL: func @test_create_nd_tdesc_vc_5({{.*}}) {
func.func @test_create_nd_tdesc_vc_5(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc}
: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
@@ -78,8 +72,7 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref<?x?xf32>, %w : index, %h : ind
// CHECK-LABEL: func @test_create_nd_tdesc_vc_6({{.*}}) {
func.func @test_create_nd_tdesc_vc_6(%src: memref<?x?xf32>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
%1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc}
: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
@@ -89,7 +82,7 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref<?x?xf32>, %w : index, %h : ind
// CHECK-LABEL: func @test_create_nd_tdesc_vc_7({{.*}}) {
func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
%1 = xegpu.create_nd_tdesc %src[%offset] {mode = vc} : memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
return
@@ -99,7 +92,7 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) {
// CHECK-LABEL: func @test_create_nd_tdesc_vc_8({{.*}}) {
func.func @test_create_nd_tdesc_vc_8(%src: memref<?x?xf32>, %w : index, %h : index, %x : index) {
%c1 = arith.constant 1 : index
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[8, %{{arg[0-9]}}], [%{{arg[0-9]}}, %{{arg[0-9]}}], [%{{arg[0-9]}}, %c1] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
%1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {mode = vc}
: memref<?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
@@ -108,7 +101,7 @@ func.func @test_create_nd_tdesc_vc_8(%src: memref<?x?xf32>, %w : index, %h : ind
// CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) {
func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[0, 0]
// CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2>>
%1 = xegpu.create_nd_tdesc %src[0, 0] {mode = vc} : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2>>
return
diff --git a/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir b/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
deleted file mode 100644
index 8fb5ac824ddb27..00000000000000
--- a/mlir/test/Dialect/XeGPU/IR/create_tdesc.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-opt %s | FileCheck %s
-// Verify the printed output can be parsed.
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
-// Verify the generic form can be parsed.
-// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
-
-// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
-func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
- %1 = xegpu.create_tdesc %src, %offsets {mode=vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
- return
-}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir b/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
index ff6f31c77064af..8df22fb78996a5 100644
--- a/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
+++ b/mlir/test/Dialect/XeGPU/IR/simple_gemm.mlir
@@ -23,12 +23,12 @@ func.func @test_gemm_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16
scf.for %i= %c0 to %c1024 step %c8 {
scf.for %j= %c0 to %c1024 step %c16 {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{c[0-9]}}]
// CHECK-SAME: memref<1024x1024xbf16>
// CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
%1 = xegpu.create_nd_tdesc %a[%i, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{arg[0-9]}}]
// CHECK-SAME: memref<1024x1024xbf16>
// CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %b[%c0, %j] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
@@ -37,33 +37,35 @@ func.func @test_gemm_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16
%tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16 iter_args(%subA = %1, %subB = %2, %subC = %3)
-> (!xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>, vector<8x1xf32>) {
- // CHECK: xegpu.load_nd
- // CHECK-SAME: vector<4x1x2xbf16>
+ // CHECK: xegpu.load_nd %{{arg[0-9]}} {vnni_axis = 1 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>> -> vector<4x1x2xbf16>
%4 = xegpu.load_nd %subA {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a> -> vector<4x1x2xbf16>
- // CHECK: xegpu.load_nd
- // CHECK-SAME: vector<8x1x2xbf16>
+ // CHECK: xegpu.load_nd %{{arg[0-9]}} {vnni_axis = 0 : i64}
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1x2xbf16>
%5 = xegpu.load_nd %subB {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b> -> vector<8x1x2xbf16>
- // CHECK: xegpu.dpas
+ // CHECK: xegpu.dpas %{{[0-9]}}, %{{[0-9]}}, %{{arg[0-9]}}
// CHECK-SAME: vector<4x1x2xbf16>, vector<8x1x2xbf16>, vector<8x1xf32> -> vector<8x1xf32>
%6 = xegpu.dpas %4, %5, %subC : vector<4x1x2xbf16>, vector<8x1x2xbf16>, vector<8x1xf32> -> vector<8x1xf32>
- %7 = xegpu.update_nd_offset %subA, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
- -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
+ // CHECK: xegpu.update_nd_offset %{{arg[0-9]}}, [%{{c[0-9]}}, %{{c[0-9]+}}]
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
+ %7 = xegpu.update_nd_offset %subA, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
- %8 = xegpu.update_nd_offset %subB, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
- -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
+ // CHECK: xegpu.update_nd_offset %{{arg[0-9]}}, [%{{c[0-9]+}}, %{{c[0-9]}}]
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %8 = xegpu.update_nd_offset %subB, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>, !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>, vector<8x1xf32>
}
- // CHECK: xegpu.create_nd_tdesc
- // CHECK-SAME: memref<1024x1024xf32>
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[{{%arg[0-9]}}, %{{arg[0-9]}}]
+ // CHECK-SAME: memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
%9 = xegpu.create_nd_tdesc %c[%i, %j] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
- // CHECK: xegpu.store_nd
- // CHECK-SAME: vector<8x1xf32>
+ // CHECK: xegpu.store_nd %{{[0-9]#2}}, %{{[0-9]}}
+ // CHECK-SAME: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
xegpu.store_nd %result, %9 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
}
}
diff --git a/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir b/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
index 794a6b6f1afb9c..62b972ad189fde 100644
--- a/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
+++ b/mlir/test/Dialect/XeGPU/IR/simple_gemm_vc.mlir
@@ -20,11 +20,11 @@ func.func @test_gemm_vc_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xb
scf.for %i= %c0 to %c1024 step %c8 {
scf.for %j= %c0 to %c1024 step %c16 {
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{arg[0-9]}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
%1 = xegpu.create_nd_tdesc %a[%i, %c0] {mode = vc} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[%{{c[0-9]}}, %{{arg[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
%2 = xegpu.create_nd_tdesc %b[%c0, %j] {mode = vc} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
@@ -33,30 +33,34 @@ func.func @test_gemm_vc_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xb
%tmp0, %tmp1, %result = scf.for %k= %c0 to %c1024 step %c16
iter_args(%subA = %1, %subB = %2, %subC = %3)
-> (!xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>, vector<8x16xf32>) {
- // CHECK: xegpu.load_nd
+ // CHECK: xegpu.load_nd %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>, vnni_axis = 1 : i64}
// CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
%4 = xegpu.load_nd %subA {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16> -> vector<8x8x2xbf16>
- // CHECK: xegpu.load_nd
+ // CHECK: xegpu.load_nd %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>, vnni_axis = 0 : i64}
// CHECK-SAME: !xegpu.tensor_desc<16x16xbf16> -> vector<8x16x2xbf16>
%5 = xegpu.load_nd %subB {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16> -> vector<8x16x2xbf16>
- // CHECK: xegpu.dpas
+ // CHECK: xegpu.dpas %{{[0-9]}}, %{{[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: vector<8x8x2xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
%6 = xegpu.dpas %4, %5, %subC {mode = vc} : vector<8x8x2xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+ // CHECK: xegpu.update_nd_offset %{{arg[0-9]}}, [%{{c[0-9]}}, %{{c[0-9]+}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16> -> !xegpu.tensor_desc<8x16xbf16>
%7 = xegpu.update_nd_offset %subA, [%c0, %c16] {mode = vc} : !xegpu.tensor_desc<8x16xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+ // CHECK: xegpu.update_nd_offset %{{arg[0-9]}}, [%{{c[0-9]+}}, %{{c[0-9]}}] {mode = #xegpu<mode_kind vc>}
+ // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16> -> !xegpu.tensor_desc<16x16xbf16>
%8 = xegpu.update_nd_offset %subB, [%c16, %c0] {mode = vc} : !xegpu.tensor_desc<16x16xbf16> -> !xegpu.tensor_desc<16x16xbf16>
scf.yield %7, %8, %6: !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>, vector<8x16xf32>
}
- // CHECK: xegpu.create_nd_tdesc
+ // CHECK: xegpu.create_nd_tdesc %{{arg[0-9]}}[{{%arg[0-9]}}, %{{arg[0-9]}}] {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
%9 = xegpu.create_nd_tdesc %c[%i, %j] {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: xegpu.store_nd
+ // CHECK: xegpu.store_nd %{{[0-9]#2}}, %{{[0-9]}} {mode = #xegpu<mode_kind vc>}
// CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %result, %9 {mode = vc}: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
}
diff --git a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir b/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
deleted file mode 100644
index 6d98ac3950c31f..00000000000000
--- a/mlir/test/Dialect/XeGPU/IR/store_scatter.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: mlir-opt %s | FileCheck %s
-// Verify the printed output can be parsed.
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
-// Verify the generic form can be parsed.
-// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
-
-// CHECK-LABEL: func @test_store_scatter({{.*}}) {
-func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui64) {
- %0 = arith.constant dense<true>: vector<16xi1>
- // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
- %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
-
- // CHECK: xegpu.create_tdesc %{{arg[0-9]}}, %{{arg[0-9]}} {mode = #xegpu<mode_kind vc>}
- // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
- %2 = xegpu.create_tdesc %dst, %offsets {mode = vc}
- : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
-
- // CHECK: xegpu.load %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind cached>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
- // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
- %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
- : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
- // CHECK: xegpu.store %{{[0-9]}}, %{{[0-9]}}, %{{.*}} {l1_hint = #xegpu<cache_kind write_back>, l2_hint = #xegpu<cache_kind uncached>, mode = #xegpu<mode_kind vc>}
- // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
- xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
- return
-}
>From a49d68c0a776bc293a3a443e9e2f6236c9bfb868 Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Wed, 21 Feb 2024 15:11:19 -0600
Subject: [PATCH 3/4] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 766590f6a3f878..1fc95417196ddf 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -224,7 +224,7 @@ def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> {
}
def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure]> {
- let summary = "create scattered tensor descritors (TensorDesc).";
+ let summary = "create scattered tensor descriptors (TensorDesc).";
let description = [{
"create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
>From 795a59924914687899b4d9d0cb5a8de135213f67 Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3 at users.noreply.github.com>
Date: Wed, 21 Feb 2024 15:12:31 -0600
Subject: [PATCH 4/4] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b3dceff9587ada..1bc90edb1dc2b0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -61,7 +61,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
blocked, rows are continuous in the correspoding dimention, otherwise, rows may be not continous.
* mapping (xegpu::SubGroupMapAttr): [optional] Used to guide compiler to distribute the workload into different threads. It is default to none.
- For convinience, its attribute field can also take either "ScatteredAttr" or "SubGroupMapAttr" directly if and only
+ For convenience, its attribute field can also take either "ScatteredAttr" or "SubGroupMapAttr" directly if and only
if others are taking default values.
Syntax:
More information about the Mlir-commits
mailing list