[Mlir-commits] [mlir] [mlir][xegpu] Add definition of SliceAttr (PR #150146)
Chao Chen
llvmlistbot at llvm.org
Fri Aug 8 09:04:03 PDT 2025
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/150146
>From 2bc70b6a8487a8ce0f0e7e0c5ac5bc59035465ab Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 19:46:04 +0000
Subject: [PATCH 01/29] add definition draft of SliceAttr
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 42b5b7a0d4e3f..abbd227b9905f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -330,4 +330,25 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let genVerifyDecl = 1;
}
+
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+ let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
+
+ let description = [{
+ Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
+ However, whereas LayoutAttr requires the data to have the same rank as the attribute,
+ SliceAttr permits the data to have a lower rank. In this case, compute units in the
+ specified dimensions share the data, provided that the remaining ranks match the data
+ rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
+ vector.broadcast.
+ }];
+
+ let parameters = (ins
+ "Attribute": $parent,
+ "DenseI64ArrayAttr": $dims
+ );
+
+ let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
>From 3959f9e5027f7c21f420c44a5e34501c115df361 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 21:02:22 +0000
Subject: [PATCH 02/29] add layout traits
---
mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt | 6 ++++++
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 +
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 11 +++++++++--
mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt | 1 +
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 1 +
5 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index 3f8cac4dc07c3..bbbeb71410a9b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -12,3 +12,9 @@ mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(MLIRXeGPUEnumsIncGen)
add_dependencies(mlir-headers MLIRXeGPUEnumsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
+mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
+mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
+add_public_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
+add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
\ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 8e2784f40ad39..cc8d58d8975b4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -25,6 +25,7 @@ class TensorDescType;
} // namespace xegpu
} // namespace mlir
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
#define GET_ATTRDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index abbd227b9905f..b15dd4a3177f9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -169,7 +169,14 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
+def LayoutTrait: AttrInterface<"LayoutTrait"> {
+ let cppNamespace = "::mlir::xegpu";
+ let description = [{
+ Common trait for all XeGPU layouts.
+ }];
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
let summary = [{
Describes the data distribution to subgroups and work-items for a tensor
specified by the tensor descriptor.
@@ -331,7 +338,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
}
-def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 242a97ccfdf6d..89d986143e965 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
DEPENDS
MLIRXeGPUIncGen
+ MLIRXeGPUAttrInterfaceIncGen
MLIRXeGPUAttrsIncGen
MLIRXeGPUEnumsIncGen
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 78cbf884a1911..63160c98105c3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -753,6 +753,7 @@ LogicalResult ConvertLayoutOp::verify() {
} // namespace xegpu
} // namespace mlir
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
#define GET_OP_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
>From 2027cfc98321d8f68a713340cd652ab10625cfee Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 23:46:10 +0000
Subject: [PATCH 03/29] add verifier and interface
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 54 ++++++++++++++++++-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 21 ++++++++
2 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index b15dd4a3177f9..e3b06714bdcc2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -174,6 +174,17 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
let description = [{
Common trait for all XeGPU layouts.
}];
+
+ let methods = [
+ InterfaceMethod<"Get the effective sg layout",
+ "std::optional<llvm::SmallVector<int>>",
+ "getEffectiveSgLayout">,
+ InterfaceMethod<"Get the effective sg data",
+ "std::optional<llvm::SmallVector<int>>",
+ "getEffectiveSgData">,
+ ];
+
+
}
def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -331,6 +342,18 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr,
getLaneLayout(), getLaneData(), getOrder());
}
+
+ std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+ if (DenseI32ArrayAttr layout = getSgLayout())
+ return llvm::to_vector(layout.asArrayRef());
+ return std::nullopt;
+ }
+
+ std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+ if (DenseI32ArrayAttr data = getSgData())
+ return llvm::to_vector(data.asArrayRef());
+ return std::nullopt;
+ }
}];
let assemblyFormat = "`<` struct(params) `>`";
@@ -351,11 +374,40 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
}];
let parameters = (ins
- "Attribute": $parent,
+ "xegpu::LayoutAttr": $parent,
"DenseI64ArrayAttr": $dims
);
+ let extraClassDeclaration = [{
+ std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+ if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
+ llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+ llvm::SmallVector<int32_t> result;
+ for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
+ if (!llvm::is_contained(dims, i))
+ result.push_back(v);
+ }
+ return result;
+ }
+ return std::nullopt;
+ }
+ std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+ if (DenseI32ArrayAttr data = getParent().getSgData()) {
+ llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+ llvm::SmallVector<int32_t> result;
+ for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
+ if (!llvm::is_contained(dims, i))
+ result.push_back(v);
+ }
+ return result;
+ }
+ return std::nullopt;
+
+ }
+ }];
+
let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+ let genVerifyDecl = 1;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 642c393cbc2c8..7e293b6f0e1a3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -206,6 +206,27 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return success();
}
+//===----------------------------------------------------------------------===//
+// XeGPU_SliceAttr
+//===----------------------------------------------------------------------===//
+LogicalResult
+SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+ xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
+ if (!parent || !dims)
+ return emitError() << "expected parent layout and dims attribute";
+
+ int rank = parent.getRank();
+ // check every element in dims is unique and smaller than rank
+ llvm::SmallDenseSet<int64_t> seen;
+ for (int64_t dim : dims.asArrayRef()) {
+ if (dim >= rank)
+ return emitError() << "invalid dim: " << dim;
+ if (!seen.insert(dim).second)
+ return emitError() << "repeated dim: " << dim;
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
>From 638c0853dc2b76fbc01d8410cd6bb52aa7d20891 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 15:52:26 +0000
Subject: [PATCH 04/29] add invalid unit test
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
mlir/test/Dialect/XeGPU/invalid.mlir | 19 +++++++++++++++++++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index e3b06714bdcc2..d0b2e936d6508 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -406,7 +406,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
}
}];
- let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+ let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7e293b6f0e1a3..21007f98643bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -220,9 +220,9 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
llvm::SmallDenseSet<int64_t> seen;
for (int64_t dim : dims.asArrayRef()) {
if (dim >= rank)
- return emitError() << "invalid dim: " << dim;
+ return emitError() << "invalid dim (" << dim << ") in slice attribute.";
if (!seen.insert(dim).second)
- return emitError() << "repeated dim: " << dim;
+ return emitError() << "repeated dim (" << dim << ") in slice attribute.";
}
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index eb564d55bfd51..c4e72820e9aec 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -658,3 +658,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
#xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
return
}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{repeated dim (2) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [2, 2]>
+func.func @slice_attr_repeat_dim() {
+ %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+ return
+}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{invalid dim (3) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [3]>
+func.func @slice_attr_repeat_dim() {
+ %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+ return
+}
+
>From 91048f06417bd8af3d58d35a516115da044e6451 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 16:06:59 +0000
Subject: [PATCH 05/29] add wrappers
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index d0b2e936d6508..a38878bc6a61f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,8 +183,6 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
"std::optional<llvm::SmallVector<int>>",
"getEffectiveSgData">,
];
-
-
}
def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -402,7 +400,18 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
return result;
}
return std::nullopt;
+ }
+
+ DenseI32ArrayAttr getOrder() const {
+ return getParent().getOrder();
+ }
+
+ bool isWgLayout() const {
+ return getParent().isWgLayout();
+ }
+ bool isSgLayout() const {
+ return getParent().isSgLayout();
}
}];
>From ddc42c2886ae3c49f10032caea27817dc6d542de Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 17:51:42 +0000
Subject: [PATCH 06/29] update description
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 78a7c48af837e..8644be8e4204c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,7 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
"getEffectiveSgLayout">,
InterfaceMethod<"Get the effective sg data",
"std::optional<llvm::SmallVector<int>>",
- "getEffectiveSgData">,
+ "getEffectiveSgData">
];
}
@@ -375,6 +375,16 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
specified dimensions share the data, provided that the remaining ranks match the data
rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
vector.broadcast.
+
+ Example:
+ ```
+ #l = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+ #r = #xegpu.slice<#l, dims = [0]>
+
+ %exp = math.exp %input {layout_result_0 = #l}: vector<256x128xf32>
+ %red = vector.multi_reduction<add>, %exp, %acc [0] {layout_result_0 = #r}: vector<256x128xf32> to vector<128xf32>
+ %bcast = vector.broadcast %red {layout_result_0 = #l} : vector<128xf32> to vector<256x128xf32>
+ ```
}];
let parameters = (ins
>From 36e2c3a118b0167c6e4f3341533f92353ddaebe2 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 18:44:08 +0000
Subject: [PATCH 07/29] refactor
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 6 +++---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 15 +++------------
.../include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 12 ++++++++++++
3 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index cc8d58d8975b4..c2d546fa08fe0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -22,18 +22,18 @@
namespace mlir {
namespace xegpu {
class TensorDescType;
+class LayoutAttr;
} // namespace xegpu
} // namespace mlir
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
+
#define GET_ATTRDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
#define GET_TYPEDEF_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc>
-
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
-
#define GET_OP_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPU.h.inc>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 8644be8e4204c..36a12a2c2a029 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -396,24 +396,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
- llvm::SmallVector<int32_t> result;
- for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
- if (!llvm::is_contained(dims, i))
- result.push_back(v);
- }
- return result;
+ return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
}
return std::nullopt;
}
+
std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
if (DenseI32ArrayAttr data = getParent().getSgData()) {
llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
- llvm::SmallVector<int32_t> result;
- for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
- if (!llvm::is_contained(dims, i))
- result.push_back(v);
- }
- return result;
+ return XeGPUDialect::dropDims(data.asArrayRef(), dims);
}
return std::nullopt;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 549018b61d6fb..f07a758a59b96 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -41,6 +41,18 @@ def XeGPU_Dialect : Dialect {
/// Checks if the given shape can be evenly distributed based on the layout
/// and data factors provided by the LayoutAttr.
static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::LayoutAttr attr);
+
+ /// Drops the data at the specified dimensions and returns the rest, e.g.,
+ /// for data = [32, 64, 8] and dropPositions = [0, 2], it returns [64].
+ template<typename T, typename U>
+ static llvm::SmallVector<T> dropDims(llvm::ArrayRef<T> data, llvm::ArrayRef<U> dropPositions) {
+ llvm::SmallVector<T> result;
+ for (auto [i, v]: llvm::enumerate(data)) {
+ if (!llvm::is_contained(dropPositions, i))
+ result.push_back(v);
+ }
+ return result;
+ }
}];
}
>From 6872e6dbda83d21d960ffb2c5156e89b1381fdfd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 20:26:39 +0000
Subject: [PATCH 08/29] add delinearizeSubgroupId interface
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 +
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 13 ++++++++++++-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 19 +++++++++++++++++++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 ++
.../Transforms/XeGPUWgToSgDistribute.cpp | 2 +-
5 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index c2d546fa08fe0..57919966a90b2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -15,6 +15,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ShapedOpInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 36a12a2c2a029..96466550cb703 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,11 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
"getEffectiveSgLayout">,
InterfaceMethod<"Get the effective sg data",
"std::optional<llvm::SmallVector<int>>",
- "getEffectiveSgData">
+ "getEffectiveSgData">,
+ InterfaceMethod<"Delinearize the Subgroup Id",
+ "FailureOr<SmallVector<Value>>",
+ "delinearizeSubgroupId",
+ (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
];
}
@@ -358,6 +362,10 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
return llvm::to_vector(data.asArrayRef());
return std::nullopt;
}
+
+ FailureOr<SmallVector<Value>>
+ delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
}];
let assemblyFormat = "`<` struct(params) `>`";
@@ -409,6 +417,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
return std::nullopt;
}
+ FailureOr<llvm::SmallVector<Value>>
+ delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
DenseI32ArrayAttr getOrder() const {
return getParent().getOrder();
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 836478a807761..974e42140e54e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -211,6 +212,18 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return success();
}
+FailureOr<SmallVector<Value>>
+LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
+ OpBuilder &builder) {
+ assert(isWgLayout() && "delinearizeSubgroupId is only available for "
+ "workgroup-level layout attribute.");
+ auto dims =
+ llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
+ return arith::ConstantIndexOp::create(builder, loc, d);
+ });
+ return affine::delinearizeIndex(builder, loc, linearId, dims);
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
@@ -232,6 +245,12 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return success();
}
+FailureOr<SmallVector<Value>>
+SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
+ OpBuilder &builder) {
+ return getParent().delinearizeSubgroupId(linearId, loc, builder);
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index edc18025136ac..a7013ed470cab 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -838,7 +838,9 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
} // namespace xegpu
} // namespace mlir
+namespace mlir {
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
+} // namespace mlir
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
#define GET_OP_CLASSES
#include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index ef52323a9f46b..2168d43eb701b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -175,7 +175,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
}
auto deLinearizeSgId =
- affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
+ layout.delinearizeSubgroupId(linearSgId, loc, rewriter);
if (failed(deLinearizeSgId))
return failure();
SmallVector<Value> sgIds = *deLinearizeSgId;
>From 223fab912e9987e7a7ed7440fb6fd42b2d0a4dd8 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 21:05:46 +0000
Subject: [PATCH 09/29] fix format
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 57919966a90b2..eb74b8142688f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -27,8 +27,8 @@ class LayoutAttr;
} // namespace xegpu
} // namespace mlir
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
#define GET_ATTRDEF_CLASSES
>From 60e20a02b991a4276f74937ea69c483d780d2e49 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 24 Jul 2025 23:33:27 +0000
Subject: [PATCH 10/29] add impl of getOffsets for LayoutAttr
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 65 +++++++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 80 +++++++++++++++++--
.../Transforms/XeGPUWgToSgDistribute.cpp | 2 +-
3 files changed, 113 insertions(+), 34 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 94a294fdc5705..5794f786dc9b9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,15 +183,20 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
let methods = [
InterfaceMethod<"Get the effective sg layout",
- "std::optional<llvm::SmallVector<int>>",
+ "std::optional<SmallVector<int64_t>>",
"getEffectiveSgLayout">,
InterfaceMethod<"Get the effective sg data",
- "std::optional<llvm::SmallVector<int>>",
+ "std::optional<SmallVector<int64_t>>",
"getEffectiveSgData">,
InterfaceMethod<"Delinearize the Subgroup Id",
"FailureOr<SmallVector<Value>>",
"delinearizeSubgroupId",
- (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
+ (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
+
+ InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
+ "FailureOr<SmallVector<SmallVector<Value>>>",
+ "getOffsets",
+ (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>
];
}
@@ -351,20 +356,23 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
getLaneLayout(), getLaneData(), getOrder());
}
- std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+ std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
if (DenseI32ArrayAttr layout = getSgLayout())
- return llvm::to_vector(layout.asArrayRef());
+ return llvm::to_vector_of<int64_t>(layout.asArrayRef());
return std::nullopt;
}
- std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+ std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
if (DenseI32ArrayAttr data = getSgData())
- return llvm::to_vector(data.asArrayRef());
+ return llvm::to_vector_of<int64_t>(data.asArrayRef());
return std::nullopt;
}
FailureOr<SmallVector<Value>>
- delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+ delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+ FailureOr<SmallVector<SmallVector<Value>>>
+ getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
}];
@@ -401,24 +409,6 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
);
let extraClassDeclaration = [{
- std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
- if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
- llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
- return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
- }
- return std::nullopt;
- }
-
- std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
- if (DenseI32ArrayAttr data = getParent().getSgData()) {
- llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
- return XeGPUDialect::dropDims(data.asArrayRef(), dims);
- }
- return std::nullopt;
- }
-
- FailureOr<llvm::SmallVector<Value>>
- delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
DenseI32ArrayAttr getOrder() const {
return getParent().getOrder();
@@ -431,6 +421,29 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
bool isSgLayout() const {
return getParent().isSgLayout();
}
+
+ std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+ if (auto layout = getParent().getEffectiveSgLayout()) {
+ ArrayRef<int64_t> dims = getDims().asArrayRef();
+ return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*layout), dims);
+ }
+ return std::nullopt;
+ }
+
+ std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+ if (auto data = getParent().getEffectiveSgData()) {
+ ArrayRef<int64_t> dims = getDims().asArrayRef();
+ return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*data), dims);
+ }
+ return std::nullopt;
+ }
+
+ FailureOr<SmallVector<Value>>
+ delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+ FailureOr<SmallVector<SmallVector<Value>>>
+ getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
+
}];
let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 91d7b2a137efd..682f0620dbcfb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -213,17 +215,75 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
}
FailureOr<SmallVector<Value>>
-LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
- OpBuilder &builder) {
- assert(isWgLayout() && "delinearizeSubgroupId is only available for "
- "workgroup-level layout attribute.");
+LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+ Value linearId) {
+ // delinearizeSubgroupId is only available for workgroup-level layout
+ // attribute
+ if (!isWgLayout())
+ return failure();
+
auto dims =
llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
return arith::ConstantIndexOp::create(builder, loc, d);
});
+
return affine::delinearizeIndex(builder, loc, linearId, dims);
}
+FailureOr<SmallVector<SmallVector<Value>>>
+LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+ ArrayRef<int64_t> shape) {
+ if (!isWgLayout())
+ return failure();
+
+ auto sgLayout = getEffectiveSgLayout().value();
+ SmallVector<int64_t> sgShape;
+ if (auto maybeSgShape = getEffectiveSgData())
+ sgShape = maybeSgShape.value();
+ else if (auto ratio = computeShapeRatio(shape, sgLayout))
+ sgShape = ratio.value();
+ else
+ return failure();
+
+ // distUnit[i] is the minimum value between shape[i] and
+ // sgLayout[i] * sgShape[i]
+ SmallVector<int64_t> distUnit = llvm::map_to_vector(
+ llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+ [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+ // delinearize Ids
+ auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+ if (failed(maybeIds))
+ return failure();
+ SmallVector<Value> sgIds = *maybeIds;
+
+ // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+ SmallVector<Value> localOffsets = llvm::map_to_vector(
+ llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+ auto &[id, s] = t;
+ Value d = arith::ConstantIndexOp::create(builder, loc, s);
+ return index::MulOp::create(builder, loc, id, d);
+ });
+
+ SmallVector<SmallVector<Value>> offsets;
+ for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+ SmallVector<Value> base =
+ llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+ return arith::ConstantIndexOp::create(builder, loc, d);
+ });
+
+ SmallVector<Value> adds = llvm::map_to_vector(
+ llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+ return arith::AddIOp::create(builder, loc, std::get<0>(t),
+ std::get<1>(t));
+ });
+
+ offsets.push_back(adds);
+ }
+
+ return offsets;
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
@@ -246,9 +306,15 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
}
FailureOr<SmallVector<Value>>
-SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
- OpBuilder &builder) {
- return getParent().delinearizeSubgroupId(linearId, loc, builder);
+SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+ Value linearId) {
+ return getParent().delinearizeSubgroupId(builder, loc, linearId);
+}
+
+FailureOr<SmallVector<SmallVector<Value>>>
+SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+ ArrayRef<int64_t> shape) {
+ return failure();
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f914914dc6b9f..e3cf5473076e7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -213,7 +213,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
}
auto deLinearizeSgId =
- layout.delinearizeSubgroupId(adjustedSgId, loc, rewriter);
+ layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
if (failed(deLinearizeSgId))
return failure();
SmallVector<Value> sgIds = *deLinearizeSgId;
>From 3630966307810ff8ee47aa7d95328ebba225724e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:25:52 +0000
Subject: [PATCH 11/29] apply getOffsets in CreateNdDescOp
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 29 +++++++-----
.../Transforms/XeGPUWgToSgDistribute.cpp | 44 +++++++++----------
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 42 +++++++++---------
3 files changed, 60 insertions(+), 55 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 682f0620dbcfb..0b5ecfc210281 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -217,14 +217,14 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
FailureOr<SmallVector<Value>>
LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
Value linearId) {
- // delinearizeSubgroupId is only available for workgroup-level layout
- // attribute
+ // delinearizeSubgroupId is only available for
+ // workgroup-level layout attribute
if (!isWgLayout())
return failure();
auto dims =
- llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
- return arith::ConstantIndexOp::create(builder, loc, d);
+ llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
+ return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
});
return affine::delinearizeIndex(builder, loc, linearId, dims);
@@ -260,25 +260,32 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
// nd local offset, localOffset[i] = sgId[i] * sgShape[i]
SmallVector<Value> localOffsets = llvm::map_to_vector(
llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
- auto &[id, s] = t;
- Value d = arith::ConstantIndexOp::create(builder, loc, s);
- return index::MulOp::create(builder, loc, id, d);
+ return builder.createOrFold<index::MulOp>(
+ loc, std::get<0>(t),
+ builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
});
SmallVector<SmallVector<Value>> offsets;
for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
SmallVector<Value> base =
llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
- return arith::ConstantIndexOp::create(builder, loc, d);
+ return builder.create<arith::ConstantIndexOp>(loc, d);
});
SmallVector<Value> adds = llvm::map_to_vector(
llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
- return arith::AddIOp::create(builder, loc, std::get<0>(t),
- std::get<1>(t));
+ return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+ std::get<1>(t));
});
- offsets.push_back(adds);
+ SmallVector<Value> mods = llvm::map_to_vector(
+ llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+ return builder.createOrFold<index::RemUOp>(
+ loc, std::get<0>(t),
+ builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+ });
+
+ offsets.push_back(mods);
}
return offsets;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index e3cf5473076e7..af55f176cb84f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -212,39 +212,39 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
}
- auto deLinearizeSgId =
- layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
- if (failed(deLinearizeSgId))
+ auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
+ if (failed(tdescOffsets))
return failure();
- SmallVector<Value> sgIds = *deLinearizeSgId;
-
- // Calculate distribution unit shape and local offsets for subgroup
- SmallVector<int64_t> distUnitShape(sgLayout.size());
- SmallVector<Value> localOffset(sgLayout.size());
- for (size_t i = 0; i < sgLayout.size(); i++) {
- distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], wgShape[i]);
- localOffset[i] =
- rewriter.createOrFold<index::MulOp>(loc, sgIds[i], sgDataDim[i]);
- }
-
- SmallVector<OpFoldResult> originalOffsets = op.getMixedOffsets();
xegpu::TensorDescType newTdescTy =
xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
layout.dropSgLayoutAndData());
+
SmallVector<Value> newCreateNdOps;
- for (SmallVector<int64_t> distUnitBaseAddr :
- StaticTileOffsetRange(wgShape, distUnitShape)) {
- SmallVector<OpFoldResult> globalOffsets =
- calculateGlobalOffsets(rewriter, loc, originalOffsets, localOffset,
- distUnitBaseAddr, distUnitShape);
+ SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+
+ for (auto tdescOffset : *tdescOffsets) {
+ SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
+ llvm::zip_longest(tdescOffset, offset),
+ [&](const auto &t) -> OpFoldResult {
+ std::optional<Value> off = std::get<0>(t);
+ std::optional<OpFoldResult> old = std::get<1>(t);
+ if (!off.has_value())
+ return *old;
+
+ if (!old.has_value() || isZeroInteger(*old))
+ return *off;
+
+ return rewriter.createOrFold<index::AddOp>(
+ loc, *off,
+ getValueOrCreateConstantIndexOp(rewriter, loc, *old));
+ });
auto newCreateNdOp = xegpu::CreateNdDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), globalOffsets,
+ rewriter, loc, newTdescTy, op.getSource(), newOffsets,
op.getMixedSizes(), op.getMixedStrides());
newCreateNdOps.push_back(newCreateNdOp);
}
-
rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index d51122417fb61..5e6a227e92320 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -4,27 +4,25 @@
//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
gpu.module @test_1_1_assignment {
// CHECK-LABEL: create_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
+ // CHECK-SAME: [[ARG_0:%.*]]: memref<24x32xf32>
gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
- // CHECK: %[[SGID:.*]] = gpu.subgroup_id
- // CHECK: %[[C12:.*]] = arith.constant 12 : index
- // CHECK: %[[C4:.*]] = arith.constant 4 : index
- // CHECK: %[[C8:.*]] = arith.constant 8 : index
- // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
- // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
- // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
- // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
- // CHECK: %[[C24:.*]] = arith.constant 24 : index
- // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
- // CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
- // CHECK: %[[C32:.*]] = arith.constant 32 : index
- // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
- // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
- // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
- // CHECK: gpu.return
+ //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+ //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+ //CHECK: [[C12:%.+]] = arith.constant 12 : index
+ //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C12]]
+ //CHECK: [[C8:%.+]] = arith.constant 8 : index
+ //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C8]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+ //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+ //CHECK: [[C24:%.+]] = arith.constant 24 : index
+ //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C24]]
+ //CHECK: [[C32:%.+]] = arith.constant 32 : index
+ //CHECK: [[X:%.+]] = index.remu [[UX]], [[C32]]
+ //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<24x32xf32> -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
+
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
gpu.return
@@ -180,7 +178,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
-> vector<24x1xf32>
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
// CHECK-SAME: : vector<12x1xf32> to vector<12x8xf32>
- %broadcast = vector.broadcast %load
+ %broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 8], lane_layout = [2, 1], lane_data = [1, 1]>}
: vector<24x1xf32> to vector<24x8xf32>
gpu.return
@@ -367,7 +365,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
// CHECK-LABEL: @subgroup_id_range_nested_if
gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
%sg_id = gpu.subgroup_id : index
- %c1 = arith.constant 1 : i1
+ %c1 = arith.constant 1 : i1
%c3 = arith.constant 3 : index
%c32 = arith.constant 32 : index
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
>From 398d69beac1e69ef72f23dea5b5649e4dc9a0ffd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:32:43 +0000
Subject: [PATCH 12/29] cleanup
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 +
.../Transforms/XeGPUWgToSgDistribute.cpp | 59 +++----------------
2 files changed, 8 insertions(+), 52 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0b5ecfc210281..ef336ce800385 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -222,6 +222,7 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
if (!isWgLayout())
return failure();
+ // TODO: handle order attribute
auto dims =
llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af55f176cb84f..640d74d3e3715 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -125,39 +125,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
- // Calculate offset for each subgroup
- static SmallVector<OpFoldResult>
- calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
- const SmallVector<OpFoldResult> &originalOffsets,
- const SmallVector<Value> &localOffset,
- const SmallVector<int64_t> &distUnitBaseAddr,
- const SmallVector<int64_t> &distUnitShape) {
- assert(localOffset.size() == distUnitBaseAddr.size() &&
- "localOffset and distUnitBaseAddr must have the same rank");
-
- SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
- originalOffsets.end());
- size_t rank = localOffset.size();
- for (size_t i = 0; i < rank; ++i) {
- size_t dimIdx = originalOffsets.size() - rank + i;
- Value constOffset =
- arith::ConstantIndexOp::create(rewriter, loc, distUnitBaseAddr[i]);
- Value offset =
- rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
- Value modValue =
- arith::ConstantIndexOp::create(rewriter, loc, distUnitShape[i]);
- Value offsetMod =
- rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
- Value origOffset = getValueOrCreateConstantIndexOp(
- rewriter, loc, originalOffsets[dimIdx]);
- Value globalOffset =
- rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
- globalOffsets[dimIdx] = globalOffset;
- }
-
- return globalOffsets;
- }
-
LogicalResult
matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
@@ -177,28 +144,14 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
return rewriter.notifyMatchFailure(
op, "sgLayout attribute is required in layout");
- SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
-
- // TODO : Handle order attribute
// Get the subgroup ID
- auto linearSgId =
+ Value linearSgId =
gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
- // Create constants for layout dimensions
- SmallVector<Value> sgLayoutDim(sgLayout.size());
- SmallVector<Value> sgDataDim(sgShape.size());
-
- for (size_t i = 0; i < sgLayout.size(); i++) {
- sgLayoutDim[i] =
- arith::ConstantIndexOp::create(rewriter, loc, sgLayout[i]);
- sgDataDim[i] = arith::ConstantIndexOp::create(rewriter, loc, sgShape[i]);
- }
-
int64_t startOfRange = -1, endOfRange = -1;
bool sgIdRangeSpecified =
isSgIdRangeSpecified(op, startOfRange, endOfRange);
- Value adjustedSgId = linearSgId;
if (sgIdRangeSpecified) {
int64_t sgCount = endOfRange - startOfRange;
if (computeProduct(sgLayout) != sgCount)
@@ -208,14 +161,16 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
// sg id
Value startOfRangeVal =
rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
- adjustedSgId =
+ linearSgId =
rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
}
- auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
- if (failed(tdescOffsets))
+ auto maybeTdescOffsets =
+ layout.getOffsets(rewriter, loc, linearSgId, wgShape);
+ if (failed(maybeTdescOffsets))
return failure();
+ SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
xegpu::TensorDescType newTdescTy =
xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
layout.dropSgLayoutAndData());
@@ -223,7 +178,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
SmallVector<Value> newCreateNdOps;
SmallVector<OpFoldResult> offset = op.getMixedOffsets();
- for (auto tdescOffset : *tdescOffsets) {
+ for (auto tdescOffset : *maybeTdescOffsets) {
SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
llvm::zip_longest(tdescOffset, offset),
[&](const auto &t) -> OpFoldResult {
>From 08e4aa9c6df06e5d7eec54c63c96877dcc1631ac Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:28:40 +0000
Subject: [PATCH 13/29] fix a bug
---
.../Transforms/XeGPUWgToSgDistribute.cpp | 30 ++++++++-----------
1 file changed, 12 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 640d74d3e3715..688e2b25867b3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -179,26 +179,20 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
SmallVector<OpFoldResult> offset = op.getMixedOffsets();
for (auto tdescOffset : *maybeTdescOffsets) {
- SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
- llvm::zip_longest(tdescOffset, offset),
- [&](const auto &t) -> OpFoldResult {
- std::optional<Value> off = std::get<0>(t);
- std::optional<OpFoldResult> old = std::get<1>(t);
- if (!off.has_value())
- return *old;
-
- if (!old.has_value() || isZeroInteger(*old))
- return *off;
-
- return rewriter.createOrFold<index::AddOp>(
- loc, *off,
- getValueOrCreateConstantIndexOp(rewriter, loc, *old));
- });
-
- auto newCreateNdOp = xegpu::CreateNdDescOp::create(
+ SmallVector<OpFoldResult> newOffsets;
+ size_t rank = tdescOffset.size();
+ for (size_t i = 0; i < rank; i++) {
+ size_t idx = offset.size() - rank + i;
+ Value newOff = rewriter.createOrFold<index::AddOp>(
+ loc, tdescOffset[i],
+ getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
+ newOffsets.push_back(newOff);
+ }
+
+ auto newOp = xegpu::CreateNdDescOp::create(
rewriter, loc, newTdescTy, op.getSource(), newOffsets,
op.getMixedSizes(), op.getMixedStrides());
- newCreateNdOps.push_back(newCreateNdOp);
+ newCreateNdOps.push_back(newOp);
}
rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
return success();
>From 62aa1dde2f1c47bf3d9b45582c668c33ef64a987 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:36:15 +0000
Subject: [PATCH 14/29] cleanup
---
.../Transforms/XeGPUWgToSgDistribute.cpp | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 688e2b25867b3..dae1f06a8fbad 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -157,8 +157,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
if (computeProduct(sgLayout) != sgCount)
return rewriter.notifyMatchFailure(
op, "sg_layout size must match the sg_id_range");
- // Subtract startOfRange from the original subgroup id to get the adjusted
- // sg id
+ // Subtract startOfRange from the original subgroup id to get
+ // the adjusted sg id
Value startOfRangeVal =
rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
linearSgId =
@@ -176,17 +176,17 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
layout.dropSgLayoutAndData());
SmallVector<Value> newCreateNdOps;
- SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+ SmallVector<OpFoldResult> oldOffsets = op.getMixedOffsets();
- for (auto tdescOffset : *maybeTdescOffsets) {
+ for (auto tdescOffsets : *maybeTdescOffsets) {
SmallVector<OpFoldResult> newOffsets;
- size_t rank = tdescOffset.size();
+ size_t rank = tdescOffsets.size();
for (size_t i = 0; i < rank; i++) {
- size_t idx = offset.size() - rank + i;
- Value newOff = rewriter.createOrFold<index::AddOp>(
- loc, tdescOffset[i],
- getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
- newOffsets.push_back(newOff);
+ size_t idx = oldOffsets.size() - rank + i;
+ Value add = rewriter.createOrFold<index::AddOp>(
+ loc, tdescOffsets[i],
+ getValueOrCreateConstantIndexOp(rewriter, loc, oldOffsets[idx]));
+ newOffsets.push_back(add);
}
auto newOp = xegpu::CreateNdDescOp::create(
>From de0a1bbc63ac3eb04ae1e900a892dba8d03005f0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 17:18:09 +0000
Subject: [PATCH 15/29] add unit test
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 4 +
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 65 ++++++++++-
mlir/test/Dialect/XeGPU/layout.mlir | 6 +
.../Dialect/XeGPU/xegpu-attr-interface.mlir | 23 ++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 107 ++++++++++++++++++
5 files changed, 203 insertions(+), 2 deletions(-)
create mode 100644 mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 5794f786dc9b9..4f35e3ff061a4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -410,6 +410,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
let extraClassDeclaration = [{
+ int64_t getRank() const {
+ return getParent().getRank() - getDims().size();
+ }
+
DenseI32ArrayAttr getOrder() const {
return getParent().getOrder();
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ef336ce800385..fad3c6280fbbe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -296,7 +296,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
LogicalResult
-SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
if (!parent || !dims)
return emitError() << "expected parent layout and dims attribute";
@@ -322,7 +322,68 @@ SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
FailureOr<SmallVector<SmallVector<Value>>>
SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
ArrayRef<int64_t> shape) {
- return failure();
+ assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
+ if (!isWgLayout())
+ return failure();
+
+ auto sgLayout = getEffectiveSgLayout().value();
+
+ SmallVector<int64_t> sgShape;
+ if (auto maybeSgShape = getEffectiveSgData())
+ sgShape = maybeSgShape.value();
+ else if (auto ratio = computeShapeRatio(shape, sgLayout))
+ sgShape = ratio.value();
+ else
+ return failure();
+
+ // distUnit[i] is the minimum value between shape[i] and
+ // sgLayout[i] * sgShape[i]
+ SmallVector<int64_t> distUnit = llvm::map_to_vector(
+ llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+ [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+ // delinearize Ids
+ auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+ if (failed(maybeIds))
+ return failure();
+ // The effective sgIds for offsets computing correspond
+ // to the dims that are not sliced.
+ ArrayRef<int64_t> dims = getDims().asArrayRef();
+ SmallVector<Value> sgIds =
+ XeGPUDialect::dropDims(ArrayRef<Value>(*maybeIds), dims);
+
+ // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+ SmallVector<Value> localOffsets = llvm::map_to_vector(
+ llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+ return builder.createOrFold<index::MulOp>(
+ loc, std::get<0>(t),
+ builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+ });
+
+ SmallVector<SmallVector<Value>> offsets;
+ for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+ SmallVector<Value> base =
+ llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+ return builder.create<arith::ConstantIndexOp>(loc, d);
+ });
+
+ SmallVector<Value> adds = llvm::map_to_vector(
+ llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+ return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+ std::get<1>(t));
+ });
+
+ SmallVector<Value> mods = llvm::map_to_vector(
+ llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+ return builder.createOrFold<index::RemUOp>(
+ loc, std::get<0>(t),
+ builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+ });
+
+ offsets.push_back(mods);
+ }
+
+ return offsets;
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 017dacc8d629a..e5330951b065a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,4 +50,10 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
gpu.return
}
+gpu.func @slice_attr_repeat_dim() {
+ //CHECK: arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+ gpu.return
+}
+
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
new file mode 100644
index 0000000000000..6397b7fe525b8
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
+
+#block = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>
+#slice = #xegpu.slice<#block, dims=[1]>
+
+//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 8)>
+gpu.module @test_1_1_assignment {
+ gpu.func @create_nd_tdesc() -> vector<128xindex> {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[c32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+ //CHECK: [[c128:%.+]] = arith.constant 128 : index
+ //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+ //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+ //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+ %step = vector.step {layout_result_0 = #slice}: vector<128xindex>
+ gpu.return %step : vector<128xindex>
+ }
+}
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index f71fcf7ca297b..1e96280769060 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -7,11 +7,14 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace mlir;
@@ -149,12 +152,116 @@ struct TestXeGPUUnrollingPatterns
}
};
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "test-xegpu-layout-interface"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
+ using OpConversionPattern<vector::StepOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+ if (!sliceAttr || sliceAttr.getRank() != 1)
+ return failure();
+
+ std::optional<SmallVector<int64_t>> sgShape =
+ sliceAttr.getEffectiveSgData();
+ if (!sgShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType type = op.getResult().getType();
+ auto wgShape = type.getShape();
+
+ Value sgId =
+ gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+ auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape);
+ if (failed(maybeOffsets))
+ return failure();
+
+ VectorType newTy = type.cloneWith(*sgShape, type.getElementType());
+ Value base = vector::StepOp::create(rewriter, loc, newTy);
+ SmallVector<Value> newOps;
+ for (auto offsets : *maybeOffsets) {
+ Value bcast =
+ vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
+ Value add = arith::AddIOp::create(rewriter, loc, base, bcast);
+ newOps.push_back(add);
+ }
+ rewriter.replaceOpWithMultiple(op, {newOps});
+ return success();
+ }
+};
+
+struct TestXeGPULayoutInterface
+ : public PassWrapper<TestXeGPULayoutInterface,
+ OperationPass<gpu::GPUModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPULayoutInterface)
+
+ StringRef getArgument() const final { return "test-xegpu-layout-interface"; }
+
+ StringRef getDescription() const final {
+ return "Test the implementation of XeGPU Layout interfaces";
+ }
+
+ void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<vector::VectorDialect>();
+ registry.insert<index::IndexDialect>();
+ }
+
+ TestXeGPULayoutInterface() = default;
+ TestXeGPULayoutInterface(const TestXeGPULayoutInterface &pass)
+ : PassWrapper(pass) {}
+
+ void runOnOperation() override {
+ MLIRContext *ctx = &getContext();
+
+ TypeConverter typeConverter;
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+ .getResult(0);
+ };
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+
+ RewritePatternSet patterns(ctx);
+ patterns.add<TestStepOpPattern>(typeConverter, ctx);
+
+ ConversionTarget target(*ctx);
+ auto isLegal = [&](xegpu::SliceAttr layout) -> bool {
+ return !layout || !layout.isWgLayout();
+ };
+
+ target.addDynamicallyLegalOp<vector::StepOp>(
+ [&](vector::StepOp op) -> bool {
+ auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+ return isLegal(sliceAttr);
+ });
+
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+
+ (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+ }
+};
+
} // namespace
namespace mlir {
namespace test {
void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
+ PassRegistration<TestXeGPULayoutInterface>();
}
} // namespace test
} // namespace mlir
>From e7f2977e79bca34b5bf6fabda74d95d4c934fd7e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 19:24:09 +0000
Subject: [PATCH 16/29] fix a typo
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fad3c6280fbbe..835da3a52885e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -280,7 +280,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
});
SmallVector<Value> mods = llvm::map_to_vector(
- llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+ llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
return builder.createOrFold<index::RemUOp>(
loc, std::get<0>(t),
builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
@@ -374,7 +374,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
});
SmallVector<Value> mods = llvm::map_to_vector(
- llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+ llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
return builder.createOrFold<index::RemUOp>(
loc, std::get<0>(t),
builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
>From e3e4a618b65e7f6375d66d00d87ced9eac4b7629 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 22:50:59 +0000
Subject: [PATCH 17/29] add unit test
---
mlir/test/Dialect/XeGPU/layout.mlir | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index e5330951b065a..af13f69ab2d8a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -56,4 +56,15 @@ gpu.func @slice_attr_repeat_dim() {
gpu.return
}
+gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
+ %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
+ %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+ //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+ %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+ %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+ gpu.return %3 : vector<256x128xf32>
+}
+
}
>From 3f59105caa7a2b07055d98e5d503a4bbc348d1d4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 4 Aug 2025 16:22:04 +0000
Subject: [PATCH 18/29] fix conflicts
---
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 38 ++++++++++-----------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index df781b951f4f1..180ba8a162c9f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -6,26 +6,24 @@ gpu.module @test_1_1_assignment {
// CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
- //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
- //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
- //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
- //CHECK: [[C12:%.+]] = arith.constant 12 : index
- //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C12]]
- //CHECK: [[C8:%.+]] = arith.constant 8 : index
- //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C8]]
- //CHECK: [[C0:%.+]] = arith.constant 0 : index
- //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
- //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
- //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
- //CHECK: [[C24:%.+]] = arith.constant 24 : index
- //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C24]]
- //CHECK: [[C32:%.+]] = arith.constant 32 : index
- //CHECK: [[X:%.+]] = index.remu [[UX]], [[C32]]
- //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<24x32xf32> -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
+ //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+ //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+ //CHECK: [[C32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]]
+ //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+ //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+ //CHECK: [[C256:%.+]] = arith.constant 256 : index
+ //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+ //CHECK: [[C128:%.+]] = arith.constant 128 : index
+ //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+ //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
}
// CHECK-LABEL: load_nd_tdesc
>From 129312a92633e9ef702e282fb2ee139105706fce Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 4 Aug 2025 19:08:36 +0000
Subject: [PATCH 19/29] address comments
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6 +++---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 10 +++++-----
mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt | 2 ++
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 +-
.../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 12 ++++++------
mlir/test/Dialect/XeGPU/layout.mlir | 10 +++++-----
6 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4f35e3ff061a4..364525444769b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -429,7 +429,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
if (auto layout = getParent().getEffectiveSgLayout()) {
ArrayRef<int64_t> dims = getDims().asArrayRef();
- return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*layout), dims);
+ return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
}
return std::nullopt;
}
@@ -437,7 +437,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
if (auto data = getParent().getEffectiveSgData()) {
ArrayRef<int64_t> dims = getDims().asArrayRef();
- return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*data), dims);
+ return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
}
return std::nullopt;
}
@@ -450,7 +450,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
}];
- let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
+ let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index f07a758a59b96..76d58e5ea2424 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -42,13 +42,13 @@ def XeGPU_Dialect : Dialect {
/// and data factors provided by the LayoutAttr.
static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::LayoutAttr attr);
- /// drops the data in the specified dimension, and return the rest. e.g.,
- /// for data = [32, 64, 8], dropPositions = [0, 2], it will return [64]
+ /// drops/slices the shape in the specified dims, and return the rest. e.g.,
+ /// for shape = [32, 64, 8], dims = [0, 2], it will return [64]
template<typename T, typename U>
- static llvm::SmallVector<T> dropDims(llvm::ArrayRef<T> data, llvm::ArrayRef<U> dropPositions) {
+ static llvm::SmallVector<T> slice(llvm::ArrayRef<T> shape, llvm::ArrayRef<U> dims) {
llvm::SmallVector<T> result;
- for (auto [i, v]: llvm::enumerate(data)) {
- if (!llvm::is_contained(dropPositions, i))
+ for (auto [i, v]: llvm::enumerate(shape)) {
+ if (!llvm::is_contained(dims, i))
result.push_back(v);
}
return result;
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 89d986143e965..7c6a4f37db9af 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -13,6 +13,8 @@ add_mlir_dialect_library(MLIRXeGPUDialect
LINK_LIBS PUBLIC
MLIRArithDialect
+ MLIRIndexDialect
+ MLIRAffineUtils
MLIRArithUtils
MLIRDialectUtils
MLIRIR
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 835da3a52885e..502b45a8181e2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -350,7 +350,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
// to the dims that are not sliced.
ArrayRef<int64_t> dims = getDims().asArrayRef();
SmallVector<Value> sgIds =
- XeGPUDialect::dropDims(ArrayRef<Value>(*maybeIds), dims);
+ XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
// nd local offset, localOffset[i] = sgId[i] * sgShape[i]
SmallVector<Value> localOffsets = llvm::map_to_vector(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 0a52f7769ea7a..b0600273b423c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -176,21 +176,21 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
layout.dropSgLayoutAndData());
SmallVector<Value> newCreateNdOps;
- SmallVector<OpFoldResult> oldOffsets = op.getMixedOffsets();
+ SmallVector<OpFoldResult> wgTileOffsets = op.getMixedOffsets();
for (auto tdescOffsets : *maybeTdescOffsets) {
- SmallVector<OpFoldResult> newOffsets;
+ SmallVector<OpFoldResult> sgTileOffsets;
size_t rank = tdescOffsets.size();
for (size_t i = 0; i < rank; i++) {
- size_t idx = oldOffsets.size() - rank + i;
+ size_t idx = wgTileOffsets.size() - rank + i;
Value add = rewriter.createOrFold<index::AddOp>(
loc, tdescOffsets[i],
- getValueOrCreateConstantIndexOp(rewriter, loc, oldOffsets[idx]));
- newOffsets.push_back(add);
+ getValueOrCreateConstantIndexOp(rewriter, loc, wgTileOffsets[idx]));
+ sgTileOffsets.push_back(add);
}
auto newOp = xegpu::CreateNdDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), newOffsets,
+ rewriter, loc, newTdescTy, op.getSource(), sgTileOffsets,
op.getMixedSizes(), op.getMixedStrides());
newCreateNdOps.push_back(newOp);
}
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index af13f69ab2d8a..ac0670cf63f94 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,17 +50,17 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
gpu.return
}
-gpu.func @slice_attr_repeat_dim() {
- //CHECK: arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
- %cst = arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+gpu.func @slice_attr() {
+ //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
gpu.return
}
gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
%cst = arith.constant dense<0.000000e+00> : vector<128xf32>
%0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
- //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
- %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+ %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
//CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
%2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
%3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
>From 0865612c7899dae0c14febc31e168f8a07a73408 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:13:14 +0000
Subject: [PATCH 20/29] add support for nested SliceAttr
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 +
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 36 +++++++++++++----
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 39 +++++++++++++++++--
mlir/test/Dialect/XeGPU/layout.mlir | 6 +++
4 files changed, 71 insertions(+), 11 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index eb74b8142688f..3592da4c46364 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -24,6 +24,7 @@ namespace mlir {
namespace xegpu {
class TensorDescType;
class LayoutAttr;
+class SliceAttr;
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 364525444769b..1cc3775998852 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -182,6 +182,9 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
}];
let methods = [
+ InterfaceMethod<"Get the rank of attribute",
+ "int64_t",
+ "getRank">,
InterfaceMethod<"Get the effective sg layout",
"std::optional<SmallVector<int64_t>>",
"getEffectiveSgLayout">,
@@ -192,7 +195,6 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
"FailureOr<SmallVector<Value>>",
"delinearizeSubgroupId",
(ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
-
InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
"FailureOr<SmallVector<SmallVector<Value>>>",
"getOffsets",
@@ -404,30 +406,40 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
}];
let parameters = (ins
- "xegpu::LayoutAttr": $parent,
+ "xegpu::LayoutTrait": $parent,
"DenseI64ArrayAttr": $dims
);
let extraClassDeclaration = [{
int64_t getRank() const {
- return getParent().getRank() - getDims().size();
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ return parent.getRank() - attr.getDims().size();
}
DenseI32ArrayAttr getOrder() const {
- return getParent().getOrder();
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ return parent.getOrder();
}
bool isWgLayout() const {
- return getParent().isWgLayout();
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ return parent.isWgLayout();
}
bool isSgLayout() const {
- return getParent().isSgLayout();
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ return parent.isSgLayout();
}
std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
- if (auto layout = getParent().getEffectiveSgLayout()) {
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ if (auto layout = parent.getEffectiveSgLayout()) {
ArrayRef<int64_t> dims = getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
}
@@ -435,13 +447,21 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
}
std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
- if (auto data = getParent().getEffectiveSgData()) {
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ if (auto data = parent.getEffectiveSgData()) {
ArrayRef<int64_t> dims = getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
}
return std::nullopt;
}
+ /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
+ /// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
+ /// it will coalesce two slice operations and return a simplified SliceAttr
+ /// #xegpu.slice<#xegpu.layout<sg_laout = [4, 8, 12]>, dims = [0, 1]>
+ SliceAttr flatten() const;
+
FailureOr<SmallVector<Value>>
delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 502b45a8181e2..396e0d30d5974 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -15,6 +15,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
using std::optional;
@@ -297,11 +298,12 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
//===----------------------------------------------------------------------===//
LogicalResult
SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
- xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
+ xegpu::LayoutTrait parent, DenseI64ArrayAttr dims) {
if (!parent || !dims)
return emitError() << "expected parent layout and dims attribute";
- int rank = parent.getRank();
+ int64_t rank = parent.getRank();
+
// check every element in dims is unique and smaller than rank
llvm::SmallDenseSet<int64_t> seen;
for (int64_t dim : dims.asArrayRef()) {
@@ -313,10 +315,41 @@ SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
return success();
}
+/// Collapses a (possibly nested) chain of SliceAttrs into a single SliceAttr
+/// whose parent is the root LayoutAttr and whose dims are expressed in the
+/// index space of that root layout. For example, slicing dim 0 of a rank-3
+/// layout and then dim 0 of the result flattens to dims = [0, 1] on the root.
+SliceAttr SliceAttr::flatten() const {
+  xegpu::LayoutTrait parent = getParent();
+  SmallVector<DenseI64ArrayAttr> slicedDims({getDims()});
+
+  // Walk up the chain, recording dims from the outermost slice (this one) to
+  // the innermost one.
+  while (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(parent)) {
+    parent = sliceAttr.getParent();
+    slicedDims.push_back(sliceAttr.getDims());
+  }
+
+  // The root of the chain is used as a LayoutAttr below. Use cast<> rather
+  // than an unchecked dyn_cast<> so that an unexpected root kind trips the
+  // cast assertion instead of dereferencing a null attribute.
+  auto layoutAttr = cast<xegpu::LayoutAttr>(parent);
+  SmallVector<int64_t> indices =
+      llvm::to_vector(llvm::seq<int64_t>(0, layoutAttr.getRank()));
+
+  // Get the remaining dims (flattened) by applying the slices innermost
+  // first; each slice's dims index into what the previous slice left over.
+  SmallVector<int64_t> remainingIndices(indices);
+  for (auto dim : llvm::reverse(slicedDims))
+    remainingIndices = XeGPUDialect::slice(
+        llvm::ArrayRef<int64_t>(remainingIndices), dim.asArrayRef());
+
+  // Get the flattened sliced dims: every root index that did not survive.
+  SmallVector<int64_t> flattenedDims =
+      XeGPUDialect::slice(llvm::ArrayRef<int64_t>(indices),
+                          llvm::ArrayRef<int64_t>(remainingIndices));
+
+  return xegpu::SliceAttr::get(
+      getContext(), layoutAttr,
+      DenseI64ArrayAttr::get(getContext(), flattenedDims));
+}
+
+
FailureOr<SmallVector<Value>>
SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
Value linearId) {
- return getParent().delinearizeSubgroupId(builder, loc, linearId);
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ return parent.delinearizeSubgroupId(builder, loc, linearId);
}
FailureOr<SmallVector<SmallVector<Value>>>
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index ac0670cf63f94..e4b4e22e5cf97 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -56,6 +56,12 @@ gpu.func @slice_attr() {
gpu.return
}
+gpu.func @nested_slice_attr() {
+ //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+ gpu.return
+}
+
gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
%cst = arith.constant dense<0.000000e+00> : vector<128xf32>
%0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
>From b67f2b193bd464a7a666a47fe0e0227a35c24b8e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:30:47 +0000
Subject: [PATCH 21/29] add unit test for nested slice attr
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6 ++---
.../Dialect/XeGPU/xegpu-attr-interface.mlir | 26 ++++++++++++++-----
2 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 1cc3775998852..17ea8b09bb26e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -440,7 +440,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
if (auto layout = parent.getEffectiveSgLayout()) {
- ArrayRef<int64_t> dims = getDims().asArrayRef();
+ ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
}
return std::nullopt;
@@ -450,7 +450,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
if (auto data = parent.getEffectiveSgData()) {
- ArrayRef<int64_t> dims = getDims().asArrayRef();
+ ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
}
return std::nullopt;
@@ -459,7 +459,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
/// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
/// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
/// it will coalesce two slice operations and return a simplified SliceAttr
- /// #xegpu.slice<#xegpu.layout<sg_laout = [4, 8, 12]>, dims = [0, 1]>
+ /// #xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0, 1]>
SliceAttr flatten() const;
FailureOr<SmallVector<Value>>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
index 6397b7fe525b8..547c7355e00c6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -1,11 +1,8 @@
// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
-#block = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>
-#slice = #xegpu.slice<#block, dims=[1]>
-
//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 8)>
-gpu.module @test_1_1_assignment {
- gpu.func @create_nd_tdesc() -> vector<128xindex> {
+gpu.module @test {
+ gpu.func @slice_attr() -> vector<128xindex> {
//CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
//CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
//CHECK: [[c32:%.+]] = arith.constant 32 : index
@@ -17,7 +14,24 @@ gpu.module @test_1_1_assignment {
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
- %step = vector.step {layout_result_0 = #slice}: vector<128xindex>
+ %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
gpu.return %step : vector<128xindex>
}
+
+ gpu.func @nested_slice_attr() -> vector<128xindex> {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[c32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+ //CHECK: [[c128:%.+]] = arith.constant 128 : index
+ //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+ //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+ //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+ %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex>
+ gpu.return %0 : vector<128xindex>
+ }
+
}
\ No newline at end of file
>From 01e4efe315015c2440206b169ce9b4e2366ce2f1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:59:43 +0000
Subject: [PATCH 22/29] cleanup
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 134 +++++++++------------
1 file changed, 54 insertions(+), 80 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 396e0d30d5974..77c06c2f65da9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,6 +37,54 @@ void XeGPUDialect::initialize() {
>();
}
+/// Generates the offset-computing instructions for a subgroup represented by
+/// its n-d indices (sgId), given the subgroup layout (sgLayout), the subgroup
+/// data size (sgShape), and the overall data size (shape). All four ranges
+/// must have the same number of elements.
+///
+/// The data is walked in distribution units of
+///   distUnit[i] = min(shape[i], sgLayout[i] * sgShape[i]).
+/// For every tile offset produced by StaticTileOffsetRange(shape, distUnit)
+/// the result holds one offset vector, computed per dimension as
+///   (tileOffset[i] + sgId[i] * sgShape[i]) % shape[i].
+static SmallVector<SmallVector<Value>>
+genOffsetsComputations(OpBuilder &builder, Location loc, ArrayRef<Value> sgId,
+                       ArrayRef<int64_t> sgLayout, ArrayRef<int64_t> sgShape,
+                       ArrayRef<int64_t> shape) {
+
+  SmallVector<SmallVector<Value>> offsets;
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  // zip_equal (like the other zips below) so that a rank mismatch is caught
+  // here instead of being silently truncated.
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip_equal(sgId, sgShape), [&](const auto &t) -> Value {
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+      });
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    // Static start of the current distribution unit, as index constants.
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return builder.create<arith::ConstantIndexOp>(loc, d);
+        });
+
+    // Shift the unit start by this subgroup's local offset.
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
+        });
+
+    // Wrap around the overall data size in each dimension.
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
+  }
+  return offsets;
+}
+
+
// Checks if the given shape can be evenly distributed based on the layout
// and data factors provided by the LayoutAttr.
bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
@@ -238,7 +286,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
if (!isWgLayout())
return failure();
- auto sgLayout = getEffectiveSgLayout().value();
+ SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
SmallVector<int64_t> sgShape;
if (auto maybeSgShape = getEffectiveSgData())
sgShape = maybeSgShape.value();
@@ -247,50 +295,13 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
else
return failure();
- // distUnit[i] is the minimum value between shape[i] and
- // sgLayout[i] * sgShape[i]
- SmallVector<int64_t> distUnit = llvm::map_to_vector(
- llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
- [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
-
// delinearize Ids
auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
if (failed(maybeIds))
return failure();
SmallVector<Value> sgIds = *maybeIds;
- // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
- SmallVector<Value> localOffsets = llvm::map_to_vector(
- llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
- return builder.createOrFold<index::MulOp>(
- loc, std::get<0>(t),
- builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
- });
-
- SmallVector<SmallVector<Value>> offsets;
- for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
- SmallVector<Value> base =
- llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
- return builder.create<arith::ConstantIndexOp>(loc, d);
- });
-
- SmallVector<Value> adds = llvm::map_to_vector(
- llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
- return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
- std::get<1>(t));
- });
-
- SmallVector<Value> mods = llvm::map_to_vector(
- llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
- return builder.createOrFold<index::RemUOp>(
- loc, std::get<0>(t),
- builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
- });
-
- offsets.push_back(mods);
- }
-
- return offsets;
+ return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
}
//===----------------------------------------------------------------------===//
@@ -359,8 +370,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
if (!isWgLayout())
return failure();
- auto sgLayout = getEffectiveSgLayout().value();
-
+ SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
SmallVector<int64_t> sgShape;
if (auto maybeSgShape = getEffectiveSgData())
sgShape = maybeSgShape.value();
@@ -369,54 +379,18 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
else
return failure();
- // distUnit[i] is the minimum value between shape[i] and
- // sgLayout[i] * sgShape[i]
- SmallVector<int64_t> distUnit = llvm::map_to_vector(
- llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
- [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
-
// delinearize Ids
auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
if (failed(maybeIds))
return failure();
+
// The effective sgIds for offsets computing correspond
// to the dims that are not sliced.
- ArrayRef<int64_t> dims = getDims().asArrayRef();
+ ArrayRef<int64_t> dims = flatten().getDims().asArrayRef();
SmallVector<Value> sgIds =
XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
- // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
- SmallVector<Value> localOffsets = llvm::map_to_vector(
- llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
- return builder.createOrFold<index::MulOp>(
- loc, std::get<0>(t),
- builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
- });
-
- SmallVector<SmallVector<Value>> offsets;
- for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
- SmallVector<Value> base =
- llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
- return builder.create<arith::ConstantIndexOp>(loc, d);
- });
-
- SmallVector<Value> adds = llvm::map_to_vector(
- llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
- return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
- std::get<1>(t));
- });
-
- SmallVector<Value> mods = llvm::map_to_vector(
- llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
- return builder.createOrFold<index::RemUOp>(
- loc, std::get<0>(t),
- builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
- });
-
- offsets.push_back(mods);
- }
-
- return offsets;
+ return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
}
//===----------------------------------------------------------------------===//
>From 3077c6c83632737d10493edbec7c5919cdd6af91 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 16:37:19 -0500
Subject: [PATCH 23/29] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
Co-authored-by: Charitha Saumya <136391709+charithaintc at users.noreply.github.com>
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 17ea8b09bb26e..bd162e98557f4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -390,7 +390,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
However, whereas LayoutAttr requires the data to have the same rank as the attribute,
SliceAttr permits the data to have a lower rank. In this case, compute units in the
- specified dimensions share the data, provided that the remaining ranks match the data
+ specified dimensions (given by `$dims`) share the data, provided that the remaining ranks match the data
rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
vector.broadcast.
>From d1f7bac594173ffa1a37ff034f9b417da87748ee Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 16:00:16 +0000
Subject: [PATCH 24/29] update docs
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 56 +++++++++++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 28 ++++++----
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 6 +-
3 files changed, 60 insertions(+), 30 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bd162e98557f4..1f420c13ebae0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -185,17 +185,21 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
InterfaceMethod<"Get the rank of attribute",
"int64_t",
"getRank">,
- InterfaceMethod<"Get the effective sg layout",
+ InterfaceMethod<"Get the SgLayout field of the attribute as integer array",
"std::optional<SmallVector<int64_t>>",
- "getEffectiveSgLayout">,
- InterfaceMethod<"Get the effective sg data",
+ "getSgLayoutAsInt">,
+ InterfaceMethod<"Get the SgData field of the attribute as integer array",
"std::optional<SmallVector<int64_t>>",
- "getEffectiveSgData">,
- InterfaceMethod<"Delinearize the Subgroup Id",
+ "getSgDataAsInt">,
+ InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional
+ indices based on the effective subgroup layout.}],
"FailureOr<SmallVector<Value>>",
"delinearizeSubgroupId",
(ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
- InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
+ InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks
+ assigned to a subgroup identified by linearId. The shape parameter
+ represents the workgroup-level problem size. Each subgroup may access
+ multiple blocks according to round-robin distribution rules.}],
"FailureOr<SmallVector<SmallVector<Value>>>",
"getOffsets",
(ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>
@@ -358,21 +362,27 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
getLaneLayout(), getLaneData(), getOrder());
}
- std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+ std::optional<SmallVector<int64_t>> getSgLayoutAsInt() const {
if (DenseI32ArrayAttr layout = getSgLayout())
return llvm::to_vector_of<int64_t>(layout.asArrayRef());
return std::nullopt;
}
- std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+ std::optional<SmallVector<int64_t>> getSgDataAsInt() const {
if (DenseI32ArrayAttr data = getSgData())
return llvm::to_vector_of<int64_t>(data.asArrayRef());
return std::nullopt;
}
+ /// Delinearizes a linear subgroup ID into its multidimensional indices
+ /// based on the effective subgroup layout.
FailureOr<SmallVector<Value>>
delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+ /// Generates instructions to compute multidimensional offsets for blocks
+ /// assigned to a subgroup identified by linearId. The shape parameter
+ /// represents the workgroup-level problem size. Each subgroup may access
+ /// multiple blocks according to round-robin distribution rules.
FailureOr<SmallVector<SmallVector<Value>>>
getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
@@ -390,19 +400,23 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
However, whereas LayoutAttr requires the data to have the same rank as the attribute,
SliceAttr permits the data to have a lower rank. In this case, compute units in the
- specified dimensions (given by `$dims`) share the data, provided that the remaining ranks match the data
- rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
- vector.broadcast.
+ specified dimensions (given by `$dims`) share the data, provided that the remaining
+ ranks match the data rank. SliceAttr is commonly used by operations such as
+ vector.multi_reduction and vector.broadcast.
Example:
```
#l = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
- #r = #xegpu.slice<#l, dim = 0>
+ #r = #xegpu.slice<#l, dim = [0]>
%exp = math.exp %input {layout_result_0 = #l}: vector<256x128xf32>
%red = vector.multi_reduction<add>, %exp, %acc [0] {layout_result_0 = #r}: vector<256x128xf32> to vector<128xf32>
%bcast = vector.broadcast %red {layout_result_0 = #l} : vector<128xf32> to vector<256x128xf32>
```
+ In this example, %red is conceptually divided into 4 vectors of type vector<32xf32>, each assigned to
+ a group of subgroups. Each group consists of 8 subgroups from the same column of sg_layout, sharing a
+ single reduction result of type vector<32xf32>.
+
}];
let parameters = (ins
@@ -436,20 +450,24 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
return parent.isSgLayout();
}
- std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+ /// Returns the SgLayout of the attribute, computed by applying
+ /// the slice dimensions to the underlying LayoutAttr.
+ std::optional<SmallVector<int64_t>> getSgLayoutAsInt() const {
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
- if (auto layout = parent.getEffectiveSgLayout()) {
+ if (auto layout = parent.getSgLayoutAsInt()) {
ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
}
return std::nullopt;
}
- std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+ /// Returns the SgData of the attribute, computed by applying
+ /// the slice dimensions to the underlying LayoutAttr.
+ std::optional<SmallVector<int64_t>> getSgDataAsInt() const {
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
- if (auto data = parent.getEffectiveSgData()) {
+ if (auto data = parent.getSgDataAsInt()) {
ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
}
@@ -462,9 +480,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
/// #xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0, 1]>
SliceAttr flatten() const;
+ /// Delinearizes a linear subgroup ID into its multidimensional indices
+ /// based on the effective subgroup layout.
FailureOr<SmallVector<Value>>
delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+ /// Generates instructions to compute multidimensional offsets for blocks
+ /// assigned to a subgroup identified by linearId. The shape parameter
+ /// represents the workgroup-level problem size. Each subgroup may access
+ /// multiple blocks according to round-robin distribution rules.
FailureOr<SmallVector<SmallVector<Value>>>
getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 77c06c2f65da9..25ff7cba92a83 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,9 +37,10 @@ void XeGPUDialect::initialize() {
>();
}
-// generate offsets computing instructions for a subgroup
-// represented by a nd indices (sgId), given the subgroup layout (sgLayout),
-// the subgroup data size (sgShape), and the overall data size (shape)
+/// Generates instructions to compute offsets for a subgroup identified by
+/// its multidimensional indices (sgId), using the specified subgroup layout
+/// (sgLayout), subgroup data dimensions (sgShape), and the overall data
+/// dimensions (shape).
static SmallVector<SmallVector<Value>>
genOffsetsComputations(OpBuilder &builder, Location loc,
SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
@@ -272,23 +273,24 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
return failure();
// TODO: handle order attribute
- auto dims =
- llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
- return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
- });
+ auto dims = llvm::map_to_vector(*getSgLayoutAsInt(), [&](int64_t d) -> Value {
+ return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
+ });
return affine::delinearizeIndex(builder, loc, linearId, dims);
}
+/// Implements LayoutTrait::getOffsets to generate instructions for
+/// computing multi-dimensional offsets when distributed by LayoutAttr.
FailureOr<SmallVector<SmallVector<Value>>>
LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
ArrayRef<int64_t> shape) {
if (!isWgLayout())
return failure();
- SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
+ SmallVector<int64_t> sgLayout = getSgLayoutAsInt().value();
SmallVector<int64_t> sgShape;
- if (auto maybeSgShape = getEffectiveSgData())
+ if (auto maybeSgShape = getSgDataAsInt())
sgShape = maybeSgShape.value();
else if (auto ratio = computeShapeRatio(shape, sgLayout))
sgShape = ratio.value();
@@ -318,7 +320,7 @@ SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
// check every element in dims is unique and smaller than rank
llvm::SmallDenseSet<int64_t> seen;
for (int64_t dim : dims.asArrayRef()) {
- if (dim >= rank)
+ if (dim < 0 || dim >= rank)
return emitError() << "invalid dim (" << dim << ") in slice attribute.";
if (!seen.insert(dim).second)
return emitError() << "repeated dim (" << dim << ") in slice attribute.";
@@ -363,6 +365,8 @@ SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
return parent.delinearizeSubgroupId(builder, loc, linearId);
}
+/// Implements LayoutTrait::getOffsets to generate instructions for
+/// computing multi-dimensional offsets when distributed by SliceAttr.
FailureOr<SmallVector<SmallVector<Value>>>
SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
ArrayRef<int64_t> shape) {
@@ -370,9 +374,9 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
if (!isWgLayout())
return failure();
- SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
+ SmallVector<int64_t> sgLayout = getSgLayoutAsInt().value();
SmallVector<int64_t> sgShape;
- if (auto maybeSgShape = getEffectiveSgData())
+ if (auto maybeSgShape = getSgDataAsInt())
sgShape = maybeSgShape.value();
else if (auto ratio = computeShapeRatio(shape, sgLayout))
sgShape = ratio.value();
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 4cd662f0f6980..3bea8efcdb0ae 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -155,6 +155,9 @@ struct TestXeGPUUnrollingPatterns
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+// Test pattern for distributing vector::StepOp from workgroup to subgroup.
+// Validates LayoutTrait interfaces for offset computation abstraction between
+// LayoutAttr and SliceAttr.
class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
using OpConversionPattern<vector::StepOp>::OpConversionPattern;
@@ -167,8 +170,7 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
if (!sliceAttr || sliceAttr.getRank() != 1)
return failure();
- std::optional<SmallVector<int64_t>> sgShape =
- sliceAttr.getEffectiveSgData();
+ std::optional<SmallVector<int64_t>> sgShape = sliceAttr.getSgDataAsInt();
if (!sgShape)
return failure();
>From 27da02a9ba57d19aac0c070aedfe5b630350dfff Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 17:45:07 +0000
Subject: [PATCH 25/29] add check for order attribute
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 25ff7cba92a83..e9c6a8eed3dfb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -273,6 +273,14 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
return failure();
// TODO: handle order attribute
+ auto hasDefaultOrder = [&]() {
+ DenseI32ArrayAttr order = getOrder();
+ return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>(
+ llvm::reverse(order.asArrayRef())));
+ };
+ if (!hasDefaultOrder())
+ return mlir::emitError(loc, "order attribute is currently not supported.");
+
auto dims = llvm::map_to_vector(*getSgLayoutAsInt(), [&](int64_t d) -> Value {
return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
});
>From e49e1cf52ce5fe7ef67a99d429ef35d28f51ab12 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 17:49:33 +0000
Subject: [PATCH 26/29] clean up
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index e9c6a8eed3dfb..78b3cbdedecf8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -350,15 +350,14 @@ SliceAttr SliceAttr::flatten() const {
llvm::to_vector(llvm::seq<int64_t>(0, layoutAttr.getRank()));
// get remaining dims (flattened) by applying slice ops with all slicedDims
- SmallVector<int64_t> remainingIndices(indices);
+ SmallVector<int64_t> remainingDims(indices);
for (auto dim : llvm::reverse(slicedDims))
- remainingIndices = XeGPUDialect::slice(
- llvm::ArrayRef<int64_t>(remainingIndices), dim.asArrayRef());
+ remainingDims = XeGPUDialect::slice(llvm::ArrayRef<int64_t>(remainingDims),
+ dim.asArrayRef());
// get flattened sliced dims by applying slice ops with the remaining dims
- SmallVector<int64_t> flattendDims =
- XeGPUDialect::slice(llvm::ArrayRef<int64_t>(indices),
- llvm::ArrayRef<int64_t>(remainingIndices));
+ SmallVector<int64_t> flattendDims = XeGPUDialect::slice(
+ llvm::ArrayRef<int64_t>(indices), llvm::ArrayRef<int64_t>(remainingDims));
return xegpu::SliceAttr::get(
getContext(), layoutAttr,
>From 59de4502a82af59202a952ce56635f967fcbd1a1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 19:19:56 +0000
Subject: [PATCH 27/29] clean up
---
mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt | 2 +-
.../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 12 ++++++------
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index bbbeb71410a9b..728f1aa859061 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -17,4 +17,4 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
add_public_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
-add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
\ No newline at end of file
+add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index b0600273b423c..4a5525c8abb30 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -176,21 +176,21 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
layout.dropSgLayoutAndData());
SmallVector<Value> newCreateNdOps;
- SmallVector<OpFoldResult> wgTileOffsets = op.getMixedOffsets();
+ SmallVector<OpFoldResult> wgOffsets = op.getMixedOffsets();
for (auto tdescOffsets : *maybeTdescOffsets) {
- SmallVector<OpFoldResult> sgTileOffsets;
+ SmallVector<OpFoldResult> sgOffsets;
size_t rank = tdescOffsets.size();
for (size_t i = 0; i < rank; i++) {
- size_t idx = wgTileOffsets.size() - rank + i;
+ size_t idx = wgOffsets.size() - rank + i;
Value add = rewriter.createOrFold<index::AddOp>(
loc, tdescOffsets[i],
- getValueOrCreateConstantIndexOp(rewriter, loc, wgTileOffsets[idx]));
- sgTileOffsets.push_back(add);
+ getValueOrCreateConstantIndexOp(rewriter, loc, wgOffsets[idx]));
+ sgOffsets.push_back(add);
}
auto newOp = xegpu::CreateNdDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), sgTileOffsets,
+ rewriter, loc, newTdescTy, op.getSource(), sgOffsets,
op.getMixedSizes(), op.getMixedStrides());
newCreateNdOps.push_back(newOp);
}
>From 1b165521b29e4b595182160f5ffd94340f653c2c Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Aug 2025 15:07:01 +0000
Subject: [PATCH 28/29] address comments
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 36 ++++++++++---------
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 27 ++++++++++++++
2 files changed, 47 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78b3cbdedecf8..35fbe2edd2b2c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -39,30 +39,32 @@ void XeGPUDialect::initialize() {
/// Generates instructions to compute offsets for a subgroup identified by
/// its multidimensional indices (sgId), using the specified subgroup layout
-/// (sgLayout), subgroup data dimensions (sgShape), and the overall data
-/// dimensions (shape).
+/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data
+/// dimensions (sizePerWg).
static SmallVector<SmallVector<Value>>
-genOffsetsComputations(OpBuilder &builder, Location loc,
- SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
- ArrayRef<int64_t> sgShape, ArrayRef<int64_t> shape) {
+genOffsetsComputingInsts(OpBuilder &builder, Location loc,
+ SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
+ ArrayRef<int64_t> sizePerSg,
+ ArrayRef<int64_t> sizePerWg) {
SmallVector<SmallVector<Value>> offsets;
- // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+ // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i]
SmallVector<Value> localOffsets = llvm::map_to_vector(
- llvm::zip(sgId, sgShape), [&](const auto &t) -> Value {
+ llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value {
return builder.createOrFold<index::MulOp>(
loc, std::get<0>(t),
builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
});
- // distUnit[i] is the minimum value between shape[i] and
- // sgLayout[i] * sgShape[i]
+ // distUnit[i] is the minimum value between sizePerWg[i] and
+ // sgLayout[i] * sizePerSg[i]
SmallVector<int64_t> distUnit = llvm::map_to_vector(
- llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+ llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)),
[](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
- for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+ for (SmallVector<int64_t> unitOffs :
+ StaticTileOffsetRange(sizePerWg, distUnit)) {
SmallVector<Value> base =
llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
return builder.create<arith::ConstantIndexOp>(loc, d);
@@ -75,7 +77,7 @@ genOffsetsComputations(OpBuilder &builder, Location loc,
});
SmallVector<Value> mods = llvm::map_to_vector(
- llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
+ llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value {
return builder.createOrFold<index::RemUOp>(
loc, std::get<0>(t),
builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
@@ -300,8 +302,8 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
SmallVector<int64_t> sgShape;
if (auto maybeSgShape = getSgDataAsInt())
sgShape = maybeSgShape.value();
- else if (auto ratio = computeShapeRatio(shape, sgLayout))
- sgShape = ratio.value();
+ else if (auto derivedShape = computeShapeRatio(shape, sgLayout))
+ sgShape = derivedShape.value();
else
return failure();
@@ -311,7 +313,8 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
return failure();
SmallVector<Value> sgIds = *maybeIds;
- return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
+ return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
+ shape);
}
//===----------------------------------------------------------------------===//
@@ -401,7 +404,8 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
SmallVector<Value> sgIds =
XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
- return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
+ return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
+ shape);
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 628a4857d1253..fadd4aeba9bec 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,5 +1,8 @@
// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
+#map = affine_map<()[s0] -> (s0 floordiv 4)>
+#map1 = affine_map<()[s0] -> (s0 mod 4)>
+
gpu.module @test_round_robin_assignment {
// CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
@@ -12,6 +15,30 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
+ // CHECK-LABEL: create_nd_tdesc_with_shared_data
+ // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
+ gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) {
+ //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]]
+ //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]]
+ //CHECK: [[C16:%.+]] = arith.constant 16 : index
+ //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]]
+ //CHECK: [[C64:%.+]] = arith.constant 64 : index
+ //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+ //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+ //CHECK: [[C128:%.+]] = arith.constant 128 : index
+ //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+ //CHECK: [[C128_2:%.+]] = arith.constant 128 : index
+ //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C128_2]]
+ //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
+ gpu.return
+ }
+
// CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
>From 0511e1bfa4b7c3f206655b55f7bbca3368837576 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Aug 2025 16:03:35 +0000
Subject: [PATCH 29/29] cleanup
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 35fbe2edd2b2c..d997296a22c20 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -388,8 +388,8 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
SmallVector<int64_t> sgShape;
if (auto maybeSgShape = getSgDataAsInt())
sgShape = maybeSgShape.value();
- else if (auto ratio = computeShapeRatio(shape, sgLayout))
- sgShape = ratio.value();
+ else if (auto derivedShape = computeShapeRatio(shape, sgLayout))
+ sgShape = derivedShape.value();
else
return failure();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index fadd4aeba9bec..e5cc65e6bd3d7 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -31,11 +31,11 @@ gpu.module @test_round_robin_assignment {
//CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
//CHECK: [[C128:%.+]] = arith.constant 128 : index
//CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
- //CHECK: [[C128_2:%.+]] = arith.constant 128 : index
- //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C128_2]]
+ //CHECK: [[C64_2:%.+]] = arith.constant 64 : index
+ //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
//CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
+ -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
gpu.return
}
More information about the Mlir-commits
mailing list