[Mlir-commits] [mlir] [mlir][xegpu] Add definition of SliceAttr (PR #150146)

Fri Aug 8 09:04:03 PDT 2025

https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/150146

>From 2bc70b6a8487a8ce0f0e7e0c5ac5bc59035465ab Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 19:46:04 +0000
Subject: [PATCH 01/29] add definition draft of SliceAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 42b5b7a0d4e3f..abbd227b9905f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -330,4 +330,25 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let genVerifyDecl = 1;
 }
 
+
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+  let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
+
+  let description = [{
+    Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
+    However, whereas LayoutAttr requires the data to have the same rank as the attribute,
+    SliceAttr permits the data to have a lower rank. In this case, compute units in the
+    specified dimensions share the data, provided that the remaining ranks match the data
+    rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
+    vector.broadcast.
+  }];
+
+  let parameters = (ins
+    "Attribute": $parent,
+    "DenseI64ArrayAttr": $dims
+  );
+
+  let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

>From 3959f9e5027f7c21f420c44a5e34501c115df361 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 21:02:22 +0000
Subject: [PATCH 02/29] add layout traits

---
 mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt |  6 ++++++
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h        |  1 +
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td  | 11 +++++++++--
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt          |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp            |  1 +
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index 3f8cac4dc07c3..bbbeb71410a9b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -12,3 +12,9 @@ mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
 mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRXeGPUEnumsIncGen)
 add_dependencies(mlir-headers MLIRXeGPUEnumsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
+mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
+mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
+add_public_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
+add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
\ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 8e2784f40ad39..cc8d58d8975b4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -25,6 +25,7 @@ class TensorDescType;
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
 #define GET_ATTRDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index abbd227b9905f..b15dd4a3177f9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -169,7 +169,14 @@ def XeGPU_FenceScopeAttr:
     let assemblyFormat = "$value";
 }
 
-def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
+def LayoutTrait: AttrInterface<"LayoutTrait"> {
+  let cppNamespace = "::mlir::xegpu";
+  let description = [{
+    Common trait for all XeGPU layouts.
+  }];
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
   let summary = [{
     Describes the data distribution to subgroups and work-items for a tensor
     specified by the tensor descriptor.
@@ -331,7 +338,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
 }
 
 
-def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
 
   let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 242a97ccfdf6d..89d986143e965 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
 
   DEPENDS
   MLIRXeGPUIncGen
+  MLIRXeGPUAttrInterfaceIncGen
   MLIRXeGPUAttrsIncGen
   MLIRXeGPUEnumsIncGen
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 78cbf884a1911..63160c98105c3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -753,6 +753,7 @@ LogicalResult ConvertLayoutOp::verify() {
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>

>From 2027cfc98321d8f68a713340cd652ab10625cfee Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 23:46:10 +0000
Subject: [PATCH 03/29] add verifier and interface

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 54 ++++++++++++++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 21 ++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index b15dd4a3177f9..e3b06714bdcc2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -174,6 +174,17 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
   let description = [{
     Common trait for all XeGPU layouts.
   }];
+
+  let methods = [
+    InterfaceMethod<"Get the effective sg layout",
+                    "std::optional<llvm::SmallVector<int>>",
+                    "getEffectiveSgLayout">,
+    InterfaceMethod<"Get the effective sg data",
+                    "std::optional<llvm::SmallVector<int>>",
+                    "getEffectiveSgData">,
+  ];
+
+
 }
 
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -331,6 +342,18 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
       return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr,
                              getLaneLayout(), getLaneData(), getOrder());
     }
+
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+      if (DenseI32ArrayAttr layout = getSgLayout())
+        return llvm::to_vector(layout.asArrayRef());
+      return std::nullopt;
+    }
+
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+      if (DenseI32ArrayAttr data = getSgData())
+        return llvm::to_vector(data.asArrayRef());
+      return std::nullopt;
+    }
   }];
 
   let assemblyFormat = "`<` struct(params) `>`";
@@ -351,11 +374,40 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   }];
 
   let parameters = (ins
-    "Attribute": $parent,
+    "xegpu::LayoutAttr": $parent,
     "DenseI64ArrayAttr": $dims
   );
 
+  let extraClassDeclaration = [{
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+      if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
+        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+        llvm::SmallVector<int32_t> result;
+        for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
+          if (!llvm::is_contained(dims, i))
+            result.push_back(v);
+        }
+        return result;
+      }
+      return std::nullopt;
+    }
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+      if (DenseI32ArrayAttr data = getParent().getSgData()) {
+        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+        llvm::SmallVector<int32_t> result;
+        for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
+          if (!llvm::is_contained(dims, i))
+            result.push_back(v);
+        }
+        return result;
+      }
+      return std::nullopt;
+
+    }
+  }];
+
   let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+  let genVerifyDecl = 1;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 642c393cbc2c8..7e293b6f0e1a3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -206,6 +206,27 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_SliceAttr
+//===----------------------------------------------------------------------===//
+LogicalResult
+SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+                  xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
+  if (!parent || !dims)
+    return emitError() << "expected parent layout and dims attribute";
+
+  int rank = parent.getRank();
+  // check every element in dims is unique and smaller than rank
+  llvm::SmallDenseSet<int64_t> seen;
+  for (int64_t dim : dims.asArrayRef()) {
+    if (dim >= rank)
+      return emitError() << "invalid dim: " << dim;
+    if (!seen.insert(dim).second)
+      return emitError() << "repeated dim: " << dim;
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//

>From 638c0853dc2b76fbc01d8410cd6bb52aa7d20891 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 15:52:26 +0000
Subject: [PATCH 04/29] add invalid unit test

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  2 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  4 ++--
 mlir/test/Dialect/XeGPU/invalid.mlir          | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index e3b06714bdcc2..d0b2e936d6508 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -406,7 +406,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     }
   }];
 
-  let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+  let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
   let genVerifyDecl = 1;
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7e293b6f0e1a3..21007f98643bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -220,9 +220,9 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   llvm::SmallDenseSet<int64_t> seen;
   for (int64_t dim : dims.asArrayRef()) {
     if (dim >= rank)
-      return emitError() << "invalid dim: " << dim;
+      return emitError() << "invalid dim (" << dim << ") in slice attribute.";
     if (!seen.insert(dim).second)
-      return emitError() << "repeated dim: " << dim;
+      return emitError() << "repeated dim (" << dim << ") in slice attribute.";
   }
   return success();
 }
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index eb564d55bfd51..c4e72820e9aec 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -658,3 +658,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
         #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
   return
 }
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{repeated dim (2) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [2, 2]>
+func.func @slice_attr_repeat_dim() {
+  %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+  return
+}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{invalid dim (3) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [3]>
+func.func @slice_attr_repeat_dim() {
+  %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+  return
+}
+

>From 91048f06417bd8af3d58d35a516115da044e6451 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 16:06:59 +0000
Subject: [PATCH 05/29] add wrappers

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index d0b2e936d6508..a38878bc6a61f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,8 +183,6 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "std::optional<llvm::SmallVector<int>>",
                     "getEffectiveSgData">,
   ];
-
-
 }
 
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -402,7 +400,18 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
         return result;
       }
       return std::nullopt;
+    }
+
+    DenseI32ArrayAttr getOrder() const {
+      return getParent().getOrder();
+    }
+
+    bool isWgLayout() const {
+      return getParent().isWgLayout();
+    }
 
+    bool isSgLayout() const {
+      return getParent().isSgLayout();
     }
   }];
 

>From ddc42c2886ae3c49f10032caea27817dc6d542de Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 17:51:42 +0000
Subject: [PATCH 06/29] update description

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 78a7c48af837e..8644be8e4204c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,7 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
                     "std::optional<llvm::SmallVector<int>>",
-                    "getEffectiveSgData">,
+                    "getEffectiveSgData">
   ];
 }
 
@@ -375,6 +375,16 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     specified dimensions share the data, provided that the remaining ranks match the data
     rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
     vector.broadcast.
+
+    Example:
+    ```
+    #l = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+    #r = #xegpu.slice<#l, dims = [0]>
+
+    %exp = math.exp %input {layout_result_0 = #l}: vector<256x128xf32>
+    %red = vector.multi_reduction<add>, %exp, %acc [0] {layout_result_0 = #r}: vector<256x128xf32> to vector<128xf32>
+    %bcast = vector.broadcast %red {layout_result_0 = #l} : vector<128xf32> to vector<256x128xf32>
+    ```
   }];
 
   let parameters = (ins

>From 36e2c3a118b0167c6e4f3341533f92353ddaebe2 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 18:44:08 +0000
Subject: [PATCH 07/29] refactor

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h        |  6 +++---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td  | 15 +++------------
 .../include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 12 ++++++++++++
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index cc8d58d8975b4..c2d546fa08fe0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -22,18 +22,18 @@
 namespace mlir {
 namespace xegpu {
 class TensorDescType;
+class LayoutAttr;
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
+
 #define GET_ATTRDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
 #define GET_TYPEDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc>
-
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
-
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h.inc>
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 8644be8e4204c..36a12a2c2a029 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -396,24 +396,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
       if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
         llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        llvm::SmallVector<int32_t> result;
-        for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
-          if (!llvm::is_contained(dims, i))
-            result.push_back(v);
-        }
-        return result;
+        return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
       }
       return std::nullopt;
     }
+
     std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
       if (DenseI32ArrayAttr data = getParent().getSgData()) {
         llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        llvm::SmallVector<int32_t> result;
-        for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
-          if (!llvm::is_contained(dims, i))
-            result.push_back(v);
-        }
-        return result;
+        return XeGPUDialect::dropDims(data.asArrayRef(), dims);
       }
       return std::nullopt;
     }
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 549018b61d6fb..f07a758a59b96 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -41,6 +41,18 @@ def XeGPU_Dialect : Dialect {
       /// Checks if the given shape can be evenly distributed based on the layout
       /// and data factors provided by the LayoutAttr.
       static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::LayoutAttr attr);
+
+      /// Drops the data at the specified dimensions and returns the rest, e.g.,
+      /// for data = [32, 64, 8] and dropPositions = [0, 2], it returns [64].
+      template<typename T, typename U>
+      static llvm::SmallVector<T> dropDims(llvm::ArrayRef<T> data, llvm::ArrayRef<U> dropPositions) {
+        llvm::SmallVector<T> result;
+        for (auto [i, v]: llvm::enumerate(data)) {
+          if (!llvm::is_contained(dropPositions, i))
+            result.push_back(v);
+        }
+        return result;
+      }
     }];
 }
 

>From 6872e6dbda83d21d960ffb2c5156e89b1381fdfd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 20:26:39 +0000
Subject: [PATCH 08/29] add delinearizeSubgroupId interface

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |  1 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 13 ++++++++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 19 +++++++++++++++++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  2 ++
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  2 +-
 5 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index c2d546fa08fe0..57919966a90b2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 36a12a2c2a029..96466550cb703 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,11 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
                     "std::optional<llvm::SmallVector<int>>",
-                    "getEffectiveSgData">
+                    "getEffectiveSgData">,
+    InterfaceMethod<"Delinearize the Subgroup Id",
+                    "FailureOr<SmallVector<Value>>",
+                    "delinearizeSubgroupId",
+                    (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
   ];
 }
 
@@ -358,6 +362,10 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
         return llvm::to_vector(data.asArrayRef());
       return std::nullopt;
     }
+
+    FailureOr<SmallVector<Value>>
+    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
   }];
 
   let assemblyFormat = "`<` struct(params) `>`";
@@ -409,6 +417,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
       return std::nullopt;
     }
 
+    FailureOr<llvm::SmallVector<Value>>
+    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
     }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 836478a807761..974e42140e54e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -211,6 +212,18 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+FailureOr<SmallVector<Value>>
+LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
+                                  OpBuilder &builder) {
+  assert(isWgLayout() && "delinearizeSubgroupId is only available for "
+                         "workgroup-level layout attribute.");
+  auto dims =
+      llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
+        return arith::ConstantIndexOp::create(builder, loc, d);
+      });
+  return affine::delinearizeIndex(builder, loc, linearId, dims);
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -232,6 +245,12 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+FailureOr<SmallVector<Value>>
+SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
+                                 OpBuilder &builder) {
+  return getParent().delinearizeSubgroupId(linearId, loc, builder);
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index edc18025136ac..a7013ed470cab 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -838,7 +838,9 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 } // namespace xegpu
 } // namespace mlir
 
+namespace mlir {
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
+} // namespace mlir
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index ef52323a9f46b..2168d43eb701b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -175,7 +175,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     }
 
     auto deLinearizeSgId =
-        affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
+        layout.delinearizeSubgroupId(linearSgId, loc, rewriter);
     if (failed(deLinearizeSgId))
       return failure();
     SmallVector<Value> sgIds = *deLinearizeSgId;

>From 223fab912e9987e7a7ed7440fb6fd42b2d0a4dd8 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 21:05:46 +0000
Subject: [PATCH 09/29] fix format

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 57919966a90b2..eb74b8142688f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -27,8 +27,8 @@ class LayoutAttr;
 } // namespace xegpu
 } // namespace mlir
 
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
 
 #define GET_ATTRDEF_CLASSES

>From 60e20a02b991a4276f74937ea69c483d780d2e49 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 24 Jul 2025 23:33:27 +0000
Subject: [PATCH 10/29] add impl of getOffsets for LayoutAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 65 +++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 80 +++++++++++++++++--
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  2 +-
 3 files changed, 113 insertions(+), 34 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 94a294fdc5705..5794f786dc9b9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,15 +183,20 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
 
   let methods = [
     InterfaceMethod<"Get the effective sg layout",
-                    "std::optional<llvm::SmallVector<int>>",
+                    "std::optional<SmallVector<int64_t>>",
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
-                    "std::optional<llvm::SmallVector<int>>",
+                    "std::optional<SmallVector<int64_t>>",
                     "getEffectiveSgData">,
     InterfaceMethod<"Delinearize the Subgroup Id",
                     "FailureOr<SmallVector<Value>>",
                     "delinearizeSubgroupId",
-                    (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
+                    (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
+
+    InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
+                    "FailureOr<SmallVector<SmallVector<Value>>>",
+                    "getOffsets",
+                    (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>
   ];
 }
 
@@ -351,20 +356,23 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
                              getLaneLayout(), getLaneData(), getOrder());
     }
 
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
       if (DenseI32ArrayAttr layout = getSgLayout())
-        return llvm::to_vector(layout.asArrayRef());
+        return llvm::to_vector_of<int64_t>(layout.asArrayRef());
       return std::nullopt;
     }
 
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
       if (DenseI32ArrayAttr data = getSgData())
-        return llvm::to_vector(data.asArrayRef());
+        return llvm::to_vector_of<int64_t>(data.asArrayRef());
       return std::nullopt;
     }
 
     FailureOr<SmallVector<Value>>
-    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+    delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+    FailureOr<SmallVector<SmallVector<Value>>>
+    getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
   }];
 
@@ -401,24 +409,6 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   );
 
   let extraClassDeclaration = [{
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
-      if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
-        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
-      }
-      return std::nullopt;
-    }
-
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
-      if (DenseI32ArrayAttr data = getParent().getSgData()) {
-        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(data.asArrayRef(), dims);
-      }
-      return std::nullopt;
-    }
-
-    FailureOr<llvm::SmallVector<Value>>
-    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
 
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
@@ -431,6 +421,29 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     bool isSgLayout() const {
       return getParent().isSgLayout();
     }
+
+    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+      if (auto layout = getParent().getEffectiveSgLayout()) {
+        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*layout), dims);
+      }
+      return std::nullopt;
+    }
+
+    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+      if (auto data = getParent().getEffectiveSgData()) {
+        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*data), dims);
+      }
+      return std::nullopt;
+    }
+
+    FailureOr<SmallVector<Value>>
+    delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+    FailureOr<SmallVector<SmallVector<Value>>>
+    getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
+
   }];
 
   let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 91d7b2a137efd..682f0620dbcfb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -213,17 +215,75 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 }
 
 FailureOr<SmallVector<Value>>
-LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
-                                  OpBuilder &builder) {
-  assert(isWgLayout() && "delinearizeSubgroupId is only available for "
-                         "workgroup-level layout attribute.");
+LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+                                  Value linearId) {
+  // delinearizeSubgroupId is only available for workgroup-level layout
+  // attribute
+  if (!isWgLayout())
+    return failure();
+
   auto dims =
       llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
         return arith::ConstantIndexOp::create(builder, loc, d);
       });
+
   return affine::delinearizeIndex(builder, loc, linearId, dims);
 }
 
+FailureOr<SmallVector<SmallVector<Value>>>
+LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+                       ArrayRef<int64_t> shape) {
+  if (!isWgLayout())
+    return failure();
+
+  auto sgLayout = getEffectiveSgLayout().value();
+  SmallVector<int64_t> sgShape;
+  if (auto maybeSgShape = getEffectiveSgData())
+    sgShape = maybeSgShape.value();
+  else if (auto ratio = computeShapeRatio(shape, sgLayout))
+    sgShape = ratio.value();
+  else
+    return failure();
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  // delinearize Ids
+  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  if (failed(maybeIds))
+    return failure();
+  SmallVector<Value> sgIds = *maybeIds;
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+        auto &[id, s] = t;
+        Value d = arith::ConstantIndexOp::create(builder, loc, s);
+        return index::MulOp::create(builder, loc, id, d);
+      });
+
+  SmallVector<SmallVector<Value>> offsets;
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return arith::ConstantIndexOp::create(builder, loc, d);
+        });
+
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return arith::AddIOp::create(builder, loc, std::get<0>(t),
+                                       std::get<1>(t));
+        });
+
+    offsets.push_back(adds);
+  }
+
+  return offsets;
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -246,9 +306,15 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 }
 
 FailureOr<SmallVector<Value>>
-SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
-                                 OpBuilder &builder) {
-  return getParent().delinearizeSubgroupId(linearId, loc, builder);
+SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+                                 Value linearId) {
+  return getParent().delinearizeSubgroupId(builder, loc, linearId);
+}
+
+FailureOr<SmallVector<SmallVector<Value>>>
+SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+                      ArrayRef<int64_t> shape) {
+  return failure();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f914914dc6b9f..e3cf5473076e7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -213,7 +213,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     }
 
     auto deLinearizeSgId =
-        layout.delinearizeSubgroupId(adjustedSgId, loc, rewriter);
+        layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
     if (failed(deLinearizeSgId))
       return failure();
     SmallVector<Value> sgIds = *deLinearizeSgId;

>From 3630966307810ff8ee47aa7d95328ebba225724e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:25:52 +0000
Subject: [PATCH 11/29] apply getOffsets in CreateNdDescOp

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 29 +++++++-----
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 44 +++++++++----------
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 42 +++++++++---------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 682f0620dbcfb..0b5ecfc210281 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -217,14 +217,14 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 FailureOr<SmallVector<Value>>
 LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
                                   Value linearId) {
-  // delinearizeSubgroupId is only available for workgroup-level layout
-  // attribute
+  // delinearizeSubgroupId is only available for
+  // workgroup-level layout attribute
   if (!isWgLayout())
     return failure();
 
   auto dims =
-      llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
-        return arith::ConstantIndexOp::create(builder, loc, d);
+      llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
+        return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
       });
 
   return affine::delinearizeIndex(builder, loc, linearId, dims);
@@ -260,25 +260,32 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
   SmallVector<Value> localOffsets = llvm::map_to_vector(
       llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
-        auto &[id, s] = t;
-        Value d = arith::ConstantIndexOp::create(builder, loc, s);
-        return index::MulOp::create(builder, loc, id, d);
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
       });
 
   SmallVector<SmallVector<Value>> offsets;
   for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
     SmallVector<Value> base =
         llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
-          return arith::ConstantIndexOp::create(builder, loc, d);
+          return builder.create<arith::ConstantIndexOp>(loc, d);
         });
 
     SmallVector<Value> adds = llvm::map_to_vector(
         llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
-          return arith::AddIOp::create(builder, loc, std::get<0>(t),
-                                       std::get<1>(t));
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
         });
 
-    offsets.push_back(adds);
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
   }
 
   return offsets;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index e3cf5473076e7..af55f176cb84f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -212,39 +212,39 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
           rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
     }
 
-    auto deLinearizeSgId =
-        layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
-    if (failed(deLinearizeSgId))
+    auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
+    if (failed(tdescOffsets))
       return failure();
-    SmallVector<Value> sgIds = *deLinearizeSgId;
-
-    // Calculate distribution unit shape and local offsets for subgroup
-    SmallVector<int64_t> distUnitShape(sgLayout.size());
-    SmallVector<Value> localOffset(sgLayout.size());
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], wgShape[i]);
-      localOffset[i] =
-          rewriter.createOrFold<index::MulOp>(loc, sgIds[i], sgDataDim[i]);
-    }
-
-    SmallVector<OpFoldResult> originalOffsets = op.getMixedOffsets();
 
     xegpu::TensorDescType newTdescTy =
         xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
                                    layout.dropSgLayoutAndData());
+
     SmallVector<Value> newCreateNdOps;
-    for (SmallVector<int64_t> distUnitBaseAddr :
-         StaticTileOffsetRange(wgShape, distUnitShape)) {
-      SmallVector<OpFoldResult> globalOffsets =
-          calculateGlobalOffsets(rewriter, loc, originalOffsets, localOffset,
-                                 distUnitBaseAddr, distUnitShape);
+    SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+
+    for (auto tdescOffset : *tdescOffsets) {
+      SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
+          llvm::zip_longest(tdescOffset, offset),
+          [&](const auto &t) -> OpFoldResult {
+            std::optional<Value> off = std::get<0>(t);
+            std::optional<OpFoldResult> old = std::get<1>(t);
+            if (!off.has_value())
+              return *old;
+
+            if (!old.has_value() || isZeroInteger(*old))
+              return *off;
+
+            return rewriter.createOrFold<index::AddOp>(
+                loc, *off,
+                getValueOrCreateConstantIndexOp(rewriter, loc, *old));
+          });
 
       auto newCreateNdOp = xegpu::CreateNdDescOp::create(
-          rewriter, loc, newTdescTy, op.getSource(), globalOffsets,
+          rewriter, loc, newTdescTy, op.getSource(), newOffsets,
           op.getMixedSizes(), op.getMixedStrides());
       newCreateNdOps.push_back(newCreateNdOp);
     }
-
     rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
     return success();
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index d51122417fb61..5e6a227e92320 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -4,27 +4,25 @@
 //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
 gpu.module @test_1_1_assignment {
   // CHECK-LABEL: create_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
+  // CHECK-SAME: [[ARG_0:%.*]]: memref<24x32xf32>
   gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
-  // CHECK: %[[SGID:.*]] = gpu.subgroup_id
-  // CHECK: %[[C12:.*]] = arith.constant 12 : index
-  // CHECK: %[[C4:.*]] = arith.constant 4 : index
-  // CHECK: %[[C8:.*]] = arith.constant 8 : index
-  // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
-  // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
-  // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
-  // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
-  // CHECK: %[[C24:.*]] = arith.constant 24 : index
-  // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
-  // CHECK: %[[C32:.*]] = arith.constant 32 : index
-  // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
-  // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
-  // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
-  // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
-  // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-  // CHECK: gpu.return
+  //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+  //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+  //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+  //CHECK: [[C12:%.+]] = arith.constant 12 : index
+  //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C12]]
+  //CHECK: [[C8:%.+]] = arith.constant 8 : index
+  //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C8]]
+  //CHECK: [[C0:%.+]] = arith.constant 0 : index
+  //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+  //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+  //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+  //CHECK: [[C24:%.+]] = arith.constant 24 : index
+  //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C24]]
+  //CHECK: [[C32:%.+]] = arith.constant 32 : index
+  //CHECK: [[X:%.+]] = index.remu [[UX]], [[C32]]
+  //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<24x32xf32> -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
+
   %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
     -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
   gpu.return
@@ -180,7 +178,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       -> vector<24x1xf32>
     // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
     // CHECK-SAME: : vector<12x1xf32> to vector<12x8xf32>
-    %broadcast = vector.broadcast %load 
+    %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 8], lane_layout = [2, 1], lane_data = [1, 1]>}
       : vector<24x1xf32> to vector<24x8xf32>
     gpu.return
@@ -367,7 +365,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
   // CHECK-LABEL: @subgroup_id_range_nested_if
   gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
     %sg_id = gpu.subgroup_id : index
-    %c1 = arith.constant 1 : i1 
+    %c1 = arith.constant 1 : i1
     %c3 = arith.constant 3 : index
     %c32 = arith.constant 32 : index
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>

>From 398d69beac1e69ef72f23dea5b5649e4dc9a0ffd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:32:43 +0000
Subject: [PATCH 12/29] cleanup

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  1 +
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 59 +++----------------
 2 files changed, 8 insertions(+), 52 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0b5ecfc210281..ef336ce800385 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -222,6 +222,7 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
   if (!isWgLayout())
     return failure();
 
+  // TODO: handle order attribute
   auto dims =
       llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
         return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af55f176cb84f..640d74d3e3715 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -125,39 +125,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
 struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
-  // Calculate offset for each subgroup
-  static SmallVector<OpFoldResult>
-  calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
-                         const SmallVector<OpFoldResult> &originalOffsets,
-                         const SmallVector<Value> &localOffset,
-                         const SmallVector<int64_t> &distUnitBaseAddr,
-                         const SmallVector<int64_t> &distUnitShape) {
-    assert(localOffset.size() == distUnitBaseAddr.size() &&
-           "localOffset and distUnitBaseAddr must have the same rank");
-
-    SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
-                                            originalOffsets.end());
-    size_t rank = localOffset.size();
-    for (size_t i = 0; i < rank; ++i) {
-      size_t dimIdx = originalOffsets.size() - rank + i;
-      Value constOffset =
-          arith::ConstantIndexOp::create(rewriter, loc, distUnitBaseAddr[i]);
-      Value offset =
-          rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
-      Value modValue =
-          arith::ConstantIndexOp::create(rewriter, loc, distUnitShape[i]);
-      Value offsetMod =
-          rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
-      Value origOffset = getValueOrCreateConstantIndexOp(
-          rewriter, loc, originalOffsets[dimIdx]);
-      Value globalOffset =
-          rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
-      globalOffsets[dimIdx] = globalOffset;
-    }
-
-    return globalOffsets;
-  }
-
   LogicalResult
   matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -177,28 +144,14 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       return rewriter.notifyMatchFailure(
           op, "sgLayout attribute is required in layout");
 
-    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
-
-    // TODO : Handle order attribute
     // Get the subgroup ID
-    auto linearSgId =
+    Value linearSgId =
         gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
 
-    // Create constants for layout dimensions
-    SmallVector<Value> sgLayoutDim(sgLayout.size());
-    SmallVector<Value> sgDataDim(sgShape.size());
-
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      sgLayoutDim[i] =
-          arith::ConstantIndexOp::create(rewriter, loc, sgLayout[i]);
-      sgDataDim[i] = arith::ConstantIndexOp::create(rewriter, loc, sgShape[i]);
-    }
-
     int64_t startOfRange = -1, endOfRange = -1;
     bool sgIdRangeSpecified =
         isSgIdRangeSpecified(op, startOfRange, endOfRange);
 
-    Value adjustedSgId = linearSgId;
     if (sgIdRangeSpecified) {
       int64_t sgCount = endOfRange - startOfRange;
       if (computeProduct(sgLayout) != sgCount)
@@ -208,14 +161,16 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       // sg id
       Value startOfRangeVal =
           rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
-      adjustedSgId =
+      linearSgId =
           rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
     }
 
-    auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
-    if (failed(tdescOffsets))
+    auto maybeTdescOffsets =
+        layout.getOffsets(rewriter, loc, linearSgId, wgShape);
+    if (failed(maybeTdescOffsets))
       return failure();
 
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
     xegpu::TensorDescType newTdescTy =
         xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
                                    layout.dropSgLayoutAndData());
@@ -223,7 +178,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     SmallVector<Value> newCreateNdOps;
     SmallVector<OpFoldResult> offset = op.getMixedOffsets();
 
-    for (auto tdescOffset : *tdescOffsets) {
+    for (auto tdescOffset : *maybeTdescOffsets) {
       SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
           llvm::zip_longest(tdescOffset, offset),
           [&](const auto &t) -> OpFoldResult {

>From 08e4aa9c6df06e5d7eec54c63c96877dcc1631ac Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:28:40 +0000
Subject: [PATCH 13/29] fix a bug

---
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 640d74d3e3715..688e2b25867b3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -179,26 +179,20 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     SmallVector<OpFoldResult> offset = op.getMixedOffsets();
 
     for (auto tdescOffset : *maybeTdescOffsets) {
-      SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
-          llvm::zip_longest(tdescOffset, offset),
-          [&](const auto &t) -> OpFoldResult {
-            std::optional<Value> off = std::get<0>(t);
-            std::optional<OpFoldResult> old = std::get<1>(t);
-            if (!off.has_value())
-              return *old;
-
-            if (!old.has_value() || isZeroInteger(*old))
-              return *off;
-
-            return rewriter.createOrFold<index::AddOp>(
-                loc, *off,
-                getValueOrCreateConstantIndexOp(rewriter, loc, *old));
-          });
-
-      auto newCreateNdOp = xegpu::CreateNdDescOp::create(
+      SmallVector<OpFoldResult> newOffsets;
+      size_t rank = tdescOffset.size();
+      for (size_t i = 0; i < rank; i++) {
+        size_t idx = offset.size() - rank + i;
+        Value newOff = rewriter.createOrFold<index::AddOp>(
+            loc, tdescOffset[i],
+            getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
+        newOffsets.push_back(newOff);
+      }
+
+      auto newOp = xegpu::CreateNdDescOp::create(
           rewriter, loc, newTdescTy, op.getSource(), newOffsets,
           op.getMixedSizes(), op.getMixedStrides());
-      newCreateNdOps.push_back(newCreateNdOp);
+      newCreateNdOps.push_back(newOp);
     }
     rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
     return success();

>From 62aa1dde2f1c47bf3d9b45582c668c33ef64a987 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:36:15 +0000
Subject: [PATCH 14/29] cleanup

---
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 688e2b25867b3..dae1f06a8fbad 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -157,8 +157,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       if (computeProduct(sgLayout) != sgCount)
         return rewriter.notifyMatchFailure(
             op, "sg_layout size must match the sg_id_range");
-      // Subtract startOfRange from the original subgroup id to get the adjusted
-      // sg id
+      // Subtract startOfRange from the original subgroup id to get
+      // the adjusted sg id
       Value startOfRangeVal =
           rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
       linearSgId =
@@ -176,17 +176,17 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
                                    layout.dropSgLayoutAndData());
 
     SmallVector<Value> newCreateNdOps;
-    SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+    SmallVector<OpFoldResult> oldOffsets = op.getMixedOffsets();
 
-    for (auto tdescOffset : *maybeTdescOffsets) {
+    for (auto tdescOffsets : *maybeTdescOffsets) {
       SmallVector<OpFoldResult> newOffsets;
-      size_t rank = tdescOffset.size();
+      size_t rank = tdescOffsets.size();
       for (size_t i = 0; i < rank; i++) {
-        size_t idx = offset.size() - rank + i;
-        Value newOff = rewriter.createOrFold<index::AddOp>(
-            loc, tdescOffset[i],
-            getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
-        newOffsets.push_back(newOff);
+        size_t idx = oldOffsets.size() - rank + i;
+        Value add = rewriter.createOrFold<index::AddOp>(
+            loc, tdescOffsets[i],
+            getValueOrCreateConstantIndexOp(rewriter, loc, oldOffsets[idx]));
+        newOffsets.push_back(add);
       }
 
       auto newOp = xegpu::CreateNdDescOp::create(

>From de0a1bbc63ac3eb04ae1e900a892dba8d03005f0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 17:18:09 +0000
Subject: [PATCH 15/29] add unit test

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |   4 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  65 ++++++++++-
 mlir/test/Dialect/XeGPU/layout.mlir           |   6 +
 .../Dialect/XeGPU/xegpu-attr-interface.mlir   |  23 ++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 107 ++++++++++++++++++
 5 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 5794f786dc9b9..4f35e3ff061a4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -410,6 +410,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
 
   let extraClassDeclaration = [{
 
+    int64_t getRank() const {
+      return getParent().getRank() - getDims().size();
+    }
+
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
     }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ef336ce800385..fad3c6280fbbe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -296,7 +296,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
 LogicalResult
-SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                   xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
   if (!parent || !dims)
     return emitError() << "expected parent layout and dims attribute";
@@ -322,7 +322,68 @@ SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
 FailureOr<SmallVector<SmallVector<Value>>>
 SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                       ArrayRef<int64_t> shape) {
-  return failure();
+  assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
+  if (!isWgLayout())
+    return failure();
+
+  auto sgLayout = getEffectiveSgLayout().value();
+
+  SmallVector<int64_t> sgShape;
+  if (auto maybeSgShape = getEffectiveSgData())
+    sgShape = maybeSgShape.value();
+  else if (auto ratio = computeShapeRatio(shape, sgLayout))
+    sgShape = ratio.value();
+  else
+    return failure();
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  // delinearize Ids
+  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  if (failed(maybeIds))
+    return failure();
+  // The effective sgIds for offsets computing correspond
+  // to the dims that are not sliced.
+  ArrayRef<int64_t> dims = getDims().asArrayRef();
+  SmallVector<Value> sgIds =
+      XeGPUDialect::dropDims(ArrayRef<Value>(*maybeIds), dims);
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+      });
+
+  SmallVector<SmallVector<Value>> offsets;
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return builder.create<arith::ConstantIndexOp>(loc, d);
+        });
+
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
+        });
+
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
+  }
+
+  return offsets;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 017dacc8d629a..e5330951b065a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,4 +50,10 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
   gpu.return
 }
 
+gpu.func @slice_attr_repeat_dim() {
+  //CHECK: arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+  %cst = arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+  gpu.return
+}
+
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
new file mode 100644
index 0000000000000..6397b7fe525b8
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
+
+#block = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>
+#slice = #xegpu.slice<#block, dims=[1]>
+
+//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 8)>
+gpu.module @test_1_1_assignment {
+  gpu.func @create_nd_tdesc() -> vector<128xindex> {
+    //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+    //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+    //CHECK: [[c32:%.+]] = arith.constant 32 : index
+    //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+    //CHECK: [[c0:%.+]] = arith.constant 0 : index
+    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+    //CHECK: [[c128:%.+]] = arith.constant 128 : index
+    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+    //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+    //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+    %step = vector.step {layout_result_0 = #slice}: vector<128xindex>
+    gpu.return %step : vector<128xindex>
+  }
+}
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index f71fcf7ca297b..1e96280769060 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace mlir;
@@ -149,12 +152,116 @@ struct TestXeGPUUnrollingPatterns
   }
 };
 
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "test-xegpu-layout-interface"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
+  using OpConversionPattern<vector::StepOp>::OpConversionPattern;
+  /// Splits a vector.step tagged with a rank-1 SliceAttr into sg-sized steps.
+  LogicalResult
+  matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // The layout attribute is looked up under the name tied to result 0.
+    auto layoutName = xegpu::getLayoutName(op->getResult(0));
+    auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+    if (!sliceAttr || sliceAttr.getRank() != 1)
+      return failure();
+    // Effective per-subgroup data shape; without it the op is left alone.
+    std::optional<SmallVector<int64_t>> sgShape =
+        sliceAttr.getEffectiveSgData();
+    if (!sgShape)
+      return failure();
+
+    Location loc = op.getLoc();
+    VectorType type = op.getResult().getType();
+    auto wgShape = type.getShape();
+    // Offsets are computed from a freshly created linear subgroup id.
+    Value sgId =
+        gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+    auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape);
+    if (failed(maybeOffsets))
+      return failure();
+    // Emit one `step + broadcast(offset)` of subgroup shape per offset tuple.
+    VectorType newTy = type.cloneWith(*sgShape, type.getElementType());
+    Value base = vector::StepOp::create(rewriter, loc, newTy);
+    SmallVector<Value> newOps;
+    for (const auto &offsets : *maybeOffsets) {
+      Value bcast =
+          vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
+      Value add = arith::AddIOp::create(rewriter, loc, base, bcast);
+      newOps.push_back(add);
+    }
+    rewriter.replaceOpWithMultiple(op, {newOps});
+    return success();
+  }
+};
+
+struct TestXeGPULayoutInterface
+    : public PassWrapper<TestXeGPULayoutInterface,
+                         OperationPass<gpu::GPUModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPULayoutInterface)
+
+  StringRef getArgument() const final { return "test-xegpu-layout-interface"; }
+
+  StringRef getDescription() const final {
+    return "Test the implementation of XeGPU Layout interfaces";
+  }
+  /// Registers the dialects listed below as dependencies of this pass.
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<arith::ArithDialect>();
+    registry.insert<memref::MemRefDialect>();
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<vector::VectorDialect>();
+    registry.insert<index::IndexDialect>();
+  }
+
+  TestXeGPULayoutInterface() = default;
+  TestXeGPULayoutInterface(const TestXeGPULayoutInterface &pass)
+      : PassWrapper(pass) {}
+  /// Runs a partial conversion that applies TestStepOpPattern to the module.
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+    // Source and target materializations are plain unrealized casts.
+    TypeConverter typeConverter;
+    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                               mlir::ValueRange inputs,
+                               mlir::Location loc) -> mlir::Value {
+      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+          .getResult(0);
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+
+    RewritePatternSet patterns(ctx);
+    patterns.add<TestStepOpPattern>(typeConverter, ctx);
+    // A vector.step is illegal only while its SliceAttr is a wg layout.
+    ConversionTarget target(*ctx);
+    auto isLegal = [&](xegpu::SliceAttr layout) -> bool {
+      return !layout || !layout.isWgLayout();
+    };
+
+    target.addDynamicallyLegalOp<vector::StepOp>(
+        [&](vector::StepOp op) -> bool {
+          auto layoutName = xegpu::getLayoutName(op->getResult(0));
+          auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+          return isLegal(sliceAttr);
+        });
+    // Every other operation is treated as legal and left as is.
+    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+    // NOTE(review): a failed conversion is ignored here and not reported.
+    (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace test {
 void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
+  PassRegistration<TestXeGPULayoutInterface>();
 }
 } // namespace test
 } // namespace mlir

>From e7f2977e79bca34b5bf6fabda74d95d4c934fd7e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 19:24:09 +0000
Subject: [PATCH 16/29] fix a typo

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fad3c6280fbbe..835da3a52885e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -280,7 +280,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
         });
 
     SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
           return builder.createOrFold<index::RemUOp>(
               loc, std::get<0>(t),
               builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
@@ -374,7 +374,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
         });
 
     SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
           return builder.createOrFold<index::RemUOp>(
               loc, std::get<0>(t),
               builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));

>From e3e4a618b65e7f6375d66d00d87ced9eac4b7629 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 22:50:59 +0000
Subject: [PATCH 17/29] add unit test

---
 mlir/test/Dialect/XeGPU/layout.mlir | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index e5330951b065a..af13f69ab2d8a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -56,4 +56,15 @@ gpu.func @slice_attr_repeat_dim() {
   gpu.return
 }
 
+gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
+  %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
+  %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+  //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+  %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+  %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+  gpu.return %3 : vector<256x128xf32>
+}
+
 }

>From 3f59105caa7a2b07055d98e5d503a4bbc348d1d4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 4 Aug 2025 16:22:04 +0000
Subject: [PATCH 18/29] fix conflicts

---
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 38 ++++++++++-----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index df781b951f4f1..180ba8a162c9f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -6,26 +6,24 @@ gpu.module @test_1_1_assignment {
   // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
   gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
-  //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
-  //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
-  //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
-  //CHECK: [[C12:%.+]] = arith.constant 12 : index
-  //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C12]]
-  //CHECK: [[C8:%.+]] = arith.constant 8 : index
-  //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C8]]
-  //CHECK: [[C0:%.+]] = arith.constant 0 : index
-  //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
-  //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
-  //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
-  //CHECK: [[C24:%.+]] = arith.constant 24 : index
-  //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C24]]
-  //CHECK: [[C32:%.+]] = arith.constant 32 : index
-  //CHECK: [[X:%.+]] = index.remu [[UX]], [[C32]]
-  //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<24x32xf32> -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-
-  %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
-    -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
+    //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+    //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+    //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+    //CHECK: [[C32:%.+]] = arith.constant 32 : index
+    //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]]
+    //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
+    //CHECK: [[C0:%.+]] = arith.constant 0 : index
+    //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+    //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+    //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+    //CHECK: [[C256:%.+]] = arith.constant 256 : index
+    //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+    //CHECK: [[C128:%.+]] = arith.constant 128 : index
+    //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+    //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
   }
 
   // CHECK-LABEL: load_nd_tdesc

>From 129312a92633e9ef702e282fb2ee139105706fce Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 4 Aug 2025 19:08:36 +0000
Subject: [PATCH 19/29] address comments

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td     |  6 +++---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td   | 10 +++++-----
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt             |  2 ++
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp           |  2 +-
 .../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp       | 12 ++++++------
 mlir/test/Dialect/XeGPU/layout.mlir                  | 10 +++++-----
 6 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4f35e3ff061a4..364525444769b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -429,7 +429,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
       if (auto layout = getParent().getEffectiveSgLayout()) {
         ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*layout), dims);
+        return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
       }
       return std::nullopt;
     }
@@ -437,7 +437,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
       if (auto data = getParent().getEffectiveSgData()) {
         ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*data), dims);
+        return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
       }
       return std::nullopt;
     }
@@ -450,7 +450,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
 
   }];
 
-  let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
+  let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
   let genVerifyDecl = 1;
 }
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index f07a758a59b96..76d58e5ea2424 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -42,13 +42,13 @@ def XeGPU_Dialect : Dialect {
       /// and data factors provided by the LayoutAttr.
       static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::LayoutAttr attr);
 
-      /// drops the data in the specified dimension, and return the rest. e.g.,
-      /// for data = [32, 64, 8], dropPositions = [0, 2], it will return [64]
+      /// Drops the entries of `shape` at the positions listed in `dims` and
+      /// returns the rest, e.g., shape = [32, 64, 8], dims = [0, 2] gives [64].
       template<typename T, typename U>
-      static llvm::SmallVector<T> dropDims(llvm::ArrayRef<T> data, llvm::ArrayRef<U> dropPositions) {
+      static llvm::SmallVector<T> slice(llvm::ArrayRef<T> shape, llvm::ArrayRef<U> dims) {
         llvm::SmallVector<T> result;
-        for (auto [i, v]: llvm::enumerate(data)) {
-          if (!llvm::is_contained(dropPositions, i))
+        for (auto [i, v]: llvm::enumerate(shape)) {
+          if (!llvm::is_contained(dims, i))
             result.push_back(v);
         }
         return result;
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 89d986143e965..7c6a4f37db9af 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -13,6 +13,8 @@ add_mlir_dialect_library(MLIRXeGPUDialect
 
   LINK_LIBS PUBLIC
   MLIRArithDialect
+  MLIRIndexDialect
+  MLIRAffineUtils
   MLIRArithUtils
   MLIRDialectUtils
   MLIRIR
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 835da3a52885e..502b45a8181e2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -350,7 +350,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   // to the dims that are not sliced.
   ArrayRef<int64_t> dims = getDims().asArrayRef();
   SmallVector<Value> sgIds =
-      XeGPUDialect::dropDims(ArrayRef<Value>(*maybeIds), dims);
+      XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
 
   // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
   SmallVector<Value> localOffsets = llvm::map_to_vector(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 0a52f7769ea7a..b0600273b423c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -176,21 +176,21 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
                                    layout.dropSgLayoutAndData());
 
     SmallVector<Value> newCreateNdOps;
-    SmallVector<OpFoldResult> oldOffsets = op.getMixedOffsets();
+    SmallVector<OpFoldResult> wgTileOffsets = op.getMixedOffsets();
 
     for (auto tdescOffsets : *maybeTdescOffsets) {
-      SmallVector<OpFoldResult> newOffsets;
+      SmallVector<OpFoldResult> sgTileOffsets;
       size_t rank = tdescOffsets.size();
       for (size_t i = 0; i < rank; i++) {
-        size_t idx = oldOffsets.size() - rank + i;
+        size_t idx = wgTileOffsets.size() - rank + i;
         Value add = rewriter.createOrFold<index::AddOp>(
             loc, tdescOffsets[i],
-            getValueOrCreateConstantIndexOp(rewriter, loc, oldOffsets[idx]));
-        newOffsets.push_back(add);
+            getValueOrCreateConstantIndexOp(rewriter, loc, wgTileOffsets[idx]));
+        sgTileOffsets.push_back(add);
       }
 
       auto newOp = xegpu::CreateNdDescOp::create(
-          rewriter, loc, newTdescTy, op.getSource(), newOffsets,
+          rewriter, loc, newTdescTy, op.getSource(), sgTileOffsets,
           op.getMixedSizes(), op.getMixedStrides());
       newCreateNdOps.push_back(newOp);
     }
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index af13f69ab2d8a..ac0670cf63f94 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,17 +50,17 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
   gpu.return
 }
 
-gpu.func @slice_attr_repeat_dim() {
-  //CHECK: arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
-  %cst = arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+gpu.func @slice_attr() {
+  //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+  %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
   gpu.return
 }
 
 gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
   %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
   %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
-  //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
-  %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
   //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
   %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
   %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>

>From 0865612c7899dae0c14febc31e168f8a07a73408 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:13:14 +0000
Subject: [PATCH 20/29] add support for nested SliceAttr

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |  1 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 36 +++++++++++++----
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 39 +++++++++++++++++--
 mlir/test/Dialect/XeGPU/layout.mlir           |  6 +++
 4 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index eb74b8142688f..3592da4c46364 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -24,6 +24,7 @@ namespace mlir {
 namespace xegpu {
 class TensorDescType;
 class LayoutAttr;
+class SliceAttr;
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 364525444769b..1cc3775998852 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -182,6 +182,9 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
   }];
 
   let methods = [
+    InterfaceMethod<"Get the rank of attribute",
+                    "int64_t",
+                    "getRank">,
     InterfaceMethod<"Get the effective sg layout",
                     "std::optional<SmallVector<int64_t>>",
                     "getEffectiveSgLayout">,
@@ -192,7 +195,6 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "FailureOr<SmallVector<Value>>",
                     "delinearizeSubgroupId",
                     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
-
     InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
                     "FailureOr<SmallVector<SmallVector<Value>>>",
                     "getOffsets",
@@ -404,30 +406,40 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   }];
 
   let parameters = (ins
-    "xegpu::LayoutAttr": $parent,
+    "xegpu::LayoutTrait": $parent,
     "DenseI64ArrayAttr": $dims
   );
 
   let extraClassDeclaration = [{
 
     int64_t getRank() const {
-      return getParent().getRank() - getDims().size();
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      return parent.getRank() - attr.getDims().size();
     }
 
     DenseI32ArrayAttr getOrder() const {
-      return getParent().getOrder();
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      return parent.getOrder();
     }
 
     bool isWgLayout() const {
-      return getParent().isWgLayout();
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      return parent.isWgLayout();
     }
 
     bool isSgLayout() const {
-      return getParent().isSgLayout();
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      return parent.isSgLayout();
     }
 
     std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
-      if (auto layout = getParent().getEffectiveSgLayout()) {
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      if (auto layout = parent.getEffectiveSgLayout()) {
         ArrayRef<int64_t> dims = getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
       }
@@ -435,13 +447,21 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     }
 
     std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
-      if (auto data = getParent().getEffectiveSgData()) {
+      SliceAttr attr = flatten();
+      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+      if (auto data = parent.getEffectiveSgData()) {
         ArrayRef<int64_t> dims = getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
       }
       return std::nullopt;
     }
 
+    /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
+    /// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
+    /// it will coalese two slice operations and return a simplified SliceAttr
+    /// #xegpu.slice<#xegpu.layout<sg_laout = [4, 8, 12]>, dims = [0, 1]>
+    SliceAttr flatten() const;
+
     FailureOr<SmallVector<Value>>
     delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 502b45a8181e2..396e0d30d5974 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
 
 using std::optional;
 
@@ -297,11 +298,12 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
 //===----------------------------------------------------------------------===//
 LogicalResult
 SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
-                  xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
+                  xegpu::LayoutTrait parent, DenseI64ArrayAttr dims) {
   if (!parent || !dims)
     return emitError() << "expected parent layout and dims attribute";
 
-  int rank = parent.getRank();
+  int64_t rank = parent.getRank();
+
   // check every element in dims is unique and smaller than rank
   llvm::SmallDenseSet<int64_t> seen;
   for (int64_t dim : dims.asArrayRef()) {
@@ -313,10 +315,41 @@ SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+SliceAttr SliceAttr::flatten() const {
+  xegpu::LayoutTrait parent = getParent();
+  SmallVector<DenseI64ArrayAttr> slicedDims({getDims()});
+  // Climb through nested slices, recording dims from the outermost inward.
+  while (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(parent)) {
+    parent = sliceAttr.getParent();
+    slicedDims.push_back(sliceAttr.getDims());
+  }
+  // The root must be a LayoutAttr: cast<> asserts rather than yields null.
+  auto layoutAttr = cast<xegpu::LayoutAttr>(parent);
+  SmallVector<int64_t> indices =
+      llvm::to_vector(llvm::seq<int64_t>(0, layoutAttr.getRank()));
+
+  // Layout dims that survive every slice, applied from the innermost outward.
+  SmallVector<int64_t> remainingIndices(indices);
+  for (auto dim : llvm::reverse(slicedDims))
+    remainingIndices = XeGPUDialect::slice(
+        llvm::ArrayRef<int64_t>(remainingIndices), dim.asArrayRef());
+
+  // The flattened slice dims are the layout dims that did not survive.
+  SmallVector<int64_t> flattenedDims =
+      XeGPUDialect::slice(llvm::ArrayRef<int64_t>(indices),
+                          llvm::ArrayRef<int64_t>(remainingIndices));
+  // Rebuild as a single slice applied directly to the root layout.
+  return xegpu::SliceAttr::get(
+      getContext(), layoutAttr,
+      DenseI64ArrayAttr::get(getContext(), flattenedDims));
+}
+
 FailureOr<SmallVector<Value>>
 SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
                                  Value linearId) {
-  return getParent().delinearizeSubgroupId(builder, loc, linearId);
+  SliceAttr attr = flatten();
+  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+  return parent.delinearizeSubgroupId(builder, loc, linearId);
 }
 
 FailureOr<SmallVector<SmallVector<Value>>>
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index ac0670cf63f94..e4b4e22e5cf97 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -56,6 +56,12 @@ gpu.func @slice_attr() {
   gpu.return
 }
 
+gpu.func @nested_slice_attr() {
+  //CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+  %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>, dims = [1]>} dense<8> : vector<16xindex>
+  gpu.return
+}
+
 gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> {
   %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
   %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>

>From b67f2b193bd464a7a666a47fe0e0227a35c24b8e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:30:47 +0000
Subject: [PATCH 21/29] add unit test for nested slice attr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  6 ++---
 .../Dialect/XeGPU/xegpu-attr-interface.mlir   | 26 ++++++++++++++-----
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 1cc3775998852..17ea8b09bb26e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -440,7 +440,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
       SliceAttr attr = flatten();
       auto parent = dyn_cast<LayoutAttr>(attr.getParent());
       if (auto layout = parent.getEffectiveSgLayout()) {
-        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
       }
       return std::nullopt;
@@ -450,7 +450,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
       SliceAttr attr = flatten();
       auto parent = dyn_cast<LayoutAttr>(attr.getParent());
       if (auto data = parent.getEffectiveSgData()) {
-        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
       }
       return std::nullopt;
@@ -459,7 +459,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
     /// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
     /// it will coalese two slice operations and return a simplified SliceAttr
-    /// #xegpu.slice<#xegpu.layout<sg_laout = [4, 8, 12]>, dims = [0, 1]>
+    /// #xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0, 1]>
     SliceAttr flatten() const;
 
     FailureOr<SmallVector<Value>>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
index 6397b7fe525b8..547c7355e00c6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -1,11 +1,8 @@
 // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
 
-#block = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>
-#slice = #xegpu.slice<#block, dims=[1]>
-
 //CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)>
-gpu.module @test_1_1_assignment {
-  gpu.func @create_nd_tdesc() -> vector<128xindex> {
+gpu.module @test {
+  gpu.func @slice_attr() -> vector<128xindex> {
     //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
     //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
@@ -17,7 +14,24 @@ gpu.module @test_1_1_assignment {
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
-    %step = vector.step {layout_result_0 = #slice}: vector<128xindex>
+    %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
     gpu.return %step : vector<128xindex>
   }
+
+  gpu.func @nested_slice_attr() -> vector<128xindex> {
+    //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+    //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+    //CHECK: [[c32:%.+]] = arith.constant 32 : index
+    //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+    //CHECK: [[c0:%.+]] = arith.constant 0 : index
+    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+    //CHECK: [[c128:%.+]] = arith.constant 128 : index
+    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+    //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+    //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+    %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex>
+    gpu.return %0 : vector<128xindex>
+  }
+
 }
\ No newline at end of file

>From 01e4efe315015c2440206b169ce9b4e2366ce2f1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 17:59:43 +0000
Subject: [PATCH 22/29] cleanup

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 134 +++++++++------------
 1 file changed, 54 insertions(+), 80 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 396e0d30d5974..77c06c2f65da9 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,6 +37,54 @@ void XeGPUDialect::initialize() {
       >();
 }
 
+// generate offsets computing instructions for a subgroup
+// represented by a nd indices (sgId), given the subgroup layout (sgLayout),
+// the subgroup data size (sgShape), and the overall data size (shape)
+static SmallVector<SmallVector<Value>>
+genOffsetsComputations(OpBuilder &builder, Location loc,
+                       SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
+                       ArrayRef<int64_t> sgShape, ArrayRef<int64_t> shape) {
+
+  SmallVector<SmallVector<Value>> offsets;
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip(sgId, sgShape), [&](const auto &t) -> Value {
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+      });
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return builder.create<arith::ConstantIndexOp>(loc, d);
+        });
+
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
+        });
+
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
+  }
+  return offsets;
+}
+
 // Checks if the given shape can be evenly distributed based on the layout
 // and data factors provided by the LayoutAttr.
 bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
@@ -238,7 +286,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   if (!isWgLayout())
     return failure();
 
-  auto sgLayout = getEffectiveSgLayout().value();
+  SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
   SmallVector<int64_t> sgShape;
   if (auto maybeSgShape = getEffectiveSgData())
     sgShape = maybeSgShape.value();
@@ -247,50 +295,13 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   else
     return failure();
 
-  // distUnit[i] is the minimum value between shape[i] and
-  // sgLayout[i] * sgShape[i]
-  SmallVector<int64_t> distUnit = llvm::map_to_vector(
-      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
-      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
-
   // delinearize Ids
   auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
   if (failed(maybeIds))
     return failure();
   SmallVector<Value> sgIds = *maybeIds;
 
-  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
-  SmallVector<Value> localOffsets = llvm::map_to_vector(
-      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
-        return builder.createOrFold<index::MulOp>(
-            loc, std::get<0>(t),
-            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
-      });
-
-  SmallVector<SmallVector<Value>> offsets;
-  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
-    SmallVector<Value> base =
-        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
-          return builder.create<arith::ConstantIndexOp>(loc, d);
-        });
-
-    SmallVector<Value> adds = llvm::map_to_vector(
-        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
-          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
-                                                     std::get<1>(t));
-        });
-
-    SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
-          return builder.createOrFold<index::RemUOp>(
-              loc, std::get<0>(t),
-              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
-        });
-
-    offsets.push_back(mods);
-  }
-
-  return offsets;
+  return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
 }
 
 //===----------------------------------------------------------------------===//
@@ -359,8 +370,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   if (!isWgLayout())
     return failure();
 
-  auto sgLayout = getEffectiveSgLayout().value();
-
+  SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
   SmallVector<int64_t> sgShape;
   if (auto maybeSgShape = getEffectiveSgData())
     sgShape = maybeSgShape.value();
@@ -369,54 +379,18 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   else
     return failure();
 
-  // distUnit[i] is the minimum value between shape[i] and
-  // sgLayout[i] * sgShape[i]
-  SmallVector<int64_t> distUnit = llvm::map_to_vector(
-      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
-      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
-
   // delinearize Ids
   auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
   if (failed(maybeIds))
     return failure();
+
   // The effective sgIds for offsets computing correspond
   // to the dims that are not sliced.
-  ArrayRef<int64_t> dims = getDims().asArrayRef();
+  ArrayRef<int64_t> dims = flatten().getDims().asArrayRef();
   SmallVector<Value> sgIds =
       XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
 
-  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
-  SmallVector<Value> localOffsets = llvm::map_to_vector(
-      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
-        return builder.createOrFold<index::MulOp>(
-            loc, std::get<0>(t),
-            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
-      });
-
-  SmallVector<SmallVector<Value>> offsets;
-  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
-    SmallVector<Value> base =
-        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
-          return builder.create<arith::ConstantIndexOp>(loc, d);
-        });
-
-    SmallVector<Value> adds = llvm::map_to_vector(
-        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
-          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
-                                                     std::get<1>(t));
-        });
-
-    SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
-          return builder.createOrFold<index::RemUOp>(
-              loc, std::get<0>(t),
-              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
-        });
-
-    offsets.push_back(mods);
-  }
-
-  return offsets;
+  return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
 }
 
 //===----------------------------------------------------------------------===//

>From 3077c6c83632737d10493edbec7c5919cdd6af91 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 5 Aug 2025 16:37:19 -0500
Subject: [PATCH 23/29] Update mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Co-authored-by: Charitha Saumya <136391709+charithaintc at users.noreply.github.com>
---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 17ea8b09bb26e..bd162e98557f4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -390,7 +390,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
     However, whereas LayoutAttr requires the data to have the same rank as the attribute,
     SliceAttr permits the data to have a lower rank. In this case, compute units in the
-    specified dimensions share the data, provided that the remaining ranks match the data
+    specified dimensions (given by `$dims`) share the data, provided that the remaining ranks match the data
     rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
     vector.broadcast.
 

>From d1f7bac594173ffa1a37ff034f9b417da87748ee Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 16:00:16 +0000
Subject: [PATCH 24/29] update docs

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 56 +++++++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 28 ++++++----
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  6 +-
 3 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bd162e98557f4..1f420c13ebae0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -185,17 +185,21 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
     InterfaceMethod<"Get the rank of attribute",
                     "int64_t",
                     "getRank">,
-    InterfaceMethod<"Get the effective sg layout",
+    InterfaceMethod<"Get the SgLayout field of the attribute as integer array",
                     "std::optional<SmallVector<int64_t>>",
-                    "getEffectiveSgLayout">,
-    InterfaceMethod<"Get the effective sg data",
+                    "getSgLayoutAsInt">,
+    InterfaceMethod<"Get the SgData field of the attribute as integer array",
                     "std::optional<SmallVector<int64_t>>",
-                    "getEffectiveSgData">,
-    InterfaceMethod<"Delinearize the Subgroup Id",
+                    "getSgDataAsInt">,
+    InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional
+                      indices based on the effective subgroup layout.}],
                     "FailureOr<SmallVector<Value>>",
                     "delinearizeSubgroupId",
                     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
-    InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
+    InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks
+                      assigned to a subgroup identified by linearId. The shape parameter
+                      represents the workgroup-level problem size. Each subgroup may access
+                      multiple blocks according to round-robin distribution rules.}],
                     "FailureOr<SmallVector<SmallVector<Value>>>",
                     "getOffsets",
                     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>
@@ -358,21 +362,27 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
                              getLaneLayout(), getLaneData(), getOrder());
     }
 
-    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+    std::optional<SmallVector<int64_t>> getSgLayoutAsInt() const {
       if (DenseI32ArrayAttr layout = getSgLayout())
         return llvm::to_vector_of<int64_t>(layout.asArrayRef());
       return std::nullopt;
     }
 
-    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+    std::optional<SmallVector<int64_t>> getSgDataAsInt() const {
       if (DenseI32ArrayAttr data = getSgData())
         return llvm::to_vector_of<int64_t>(data.asArrayRef());
       return std::nullopt;
     }
 
+    /// Delinearizes a linear subgroup ID into its multidimensional indices
+    /// based on the effective subgroup layout.
     FailureOr<SmallVector<Value>>
     delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
 
+    /// Generates instructions to compute multidimensional offsets for blocks
+    /// assigned to a subgroup identified by linearId. The shape parameter
+    /// represents the workgroup-level problem size. Each subgroup may access
+    /// multiple blocks according to round-robin distribution rules.
     FailureOr<SmallVector<SmallVector<Value>>>
     getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
@@ -390,19 +400,23 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
     However, whereas LayoutAttr requires the data to have the same rank as the attribute,
     SliceAttr permits the data to have a lower rank. In this case, compute units in the
-    specified dimensions (given by `$dims`) share the data, provided that the remaining ranks match the data
-    rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
-    vector.broadcast.
+    specified dimensions (given by `$dims`) share the data, provided that the remaining
+    ranks match the data rank. SliceAttr is commonly used by operations such as
+    vector.multi_reduction and vector.broadcast.
 
     Example:
     ```
     #l = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
-    #r = #xegpu.slice<#l, dim = 0>
+    #r = #xegpu.slice<#l, dims = [0]>
 
     %exp = math.exp %input {layout_result_0 = #l}: vector<256x128xf32>
     %red = vector.multi_reduction<add>, %exp, %acc [0] {layout_result_0 = #r}: vector<256x128xf32> to vector<128xf32>
     %bcast = vector.broadcast %red {layout_result_0 = #l} : vector<128xf32> to vector<256x128xf32>
     ```
+    In this example, %red is conceptually divided into 4 vectors of type vector<32xf32>, each assigned to
+    a group of subgroups. Each group consists of 8 subgroups from the same column of sg_layout, sharing a
+    single reduction result of type vector<32xf32>.
+
   }];
 
   let parameters = (ins
@@ -436,20 +450,24 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
       return parent.isSgLayout();
     }
 
-    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+    /// Returns the SgLayout of the attribute, computed by applying
+    /// the slice dimensions to the underlying LayoutAttr.
+    std::optional<SmallVector<int64_t>> getSgLayoutAsInt() const {
       SliceAttr attr = flatten();
       auto parent = dyn_cast<LayoutAttr>(attr.getParent());
-      if (auto layout = parent.getEffectiveSgLayout()) {
+      if (auto layout = parent.getSgLayoutAsInt()) {
         ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*layout), dims);
       }
       return std::nullopt;
     }
 
-    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+    /// Returns the SgData of the attribute, computed by applying
+    /// the slice dimensions to the underlying LayoutAttr.
+    std::optional<SmallVector<int64_t>> getSgDataAsInt() const {
       SliceAttr attr = flatten();
       auto parent = dyn_cast<LayoutAttr>(attr.getParent());
-      if (auto data = parent.getEffectiveSgData()) {
+      if (auto data = parent.getSgDataAsInt()) {
         ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
         return XeGPUDialect::slice(llvm::ArrayRef<int64_t>(*data), dims);
       }
@@ -462,9 +480,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     /// #xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0, 1]>
     SliceAttr flatten() const;
 
+    /// Delinearizes a linear subgroup ID into multidimensional indices by
+    /// delegating to the parent layout; the indices are not sliced by dims.
     FailureOr<SmallVector<Value>>
     delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
 
+    /// Generates instructions to compute multidimensional offsets for blocks
+    /// assigned to a subgroup identified by linearId. The shape parameter
+    /// represents the workgroup-level problem size. Each subgroup may access
+    /// multiple blocks according to round-robin distribution rules.
     FailureOr<SmallVector<SmallVector<Value>>>
     getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 77c06c2f65da9..25ff7cba92a83 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,9 +37,10 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-// generate offsets computing instructions for a subgroup
-// represented by a nd indices (sgId), given the subgroup layout (sgLayout),
-// the subgroup data size (sgShape), and the overall data size (shape)
+/// Generates instructions to compute offsets for a subgroup identified by
+/// its multidimensional indices (sgId), using the specified subgroup layout
+/// (sgLayout), subgroup data dimensions (sgShape), and the overall data
+/// dimensions (shape).
 static SmallVector<SmallVector<Value>>
 genOffsetsComputations(OpBuilder &builder, Location loc,
                        SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
@@ -272,23 +273,24 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
     return failure();
 
   // TODO: handle order attribute
-  auto dims =
-      llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
-        return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
-      });
+  auto dims = llvm::map_to_vector(*getSgLayoutAsInt(), [&](int64_t d) -> Value {
+    return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
+  });
 
   return affine::delinearizeIndex(builder, loc, linearId, dims);
 }
 
+/// Implements LayoutTrait::getOffsets to generate instructions for
+/// computing multi-dimensional offsets when distributed by LayoutAttr.
 FailureOr<SmallVector<SmallVector<Value>>>
 LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                        ArrayRef<int64_t> shape) {
   if (!isWgLayout())
     return failure();
 
-  SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
+  SmallVector<int64_t> sgLayout = getSgLayoutAsInt().value();
   SmallVector<int64_t> sgShape;
-  if (auto maybeSgShape = getEffectiveSgData())
+  if (auto maybeSgShape = getSgDataAsInt())
     sgShape = maybeSgShape.value();
   else if (auto ratio = computeShapeRatio(shape, sgLayout))
     sgShape = ratio.value();
@@ -318,7 +320,7 @@ SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
   // check every element in dims is unique and smaller than rank
   llvm::SmallDenseSet<int64_t> seen;
   for (int64_t dim : dims.asArrayRef()) {
-    if (dim >= rank)
+    if (dim < 0 || dim >= rank)
       return emitError() << "invalid dim (" << dim << ") in slice attribute.";
     if (!seen.insert(dim).second)
       return emitError() << "repeated dim (" << dim << ") in slice attribute.";
@@ -363,6 +365,8 @@ SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
   return parent.delinearizeSubgroupId(builder, loc, linearId);
 }
 
+/// Implements LayoutTrait::getOffsets to generate instructions for
+/// computing multi-dimensional offsets when distributed by SliceAttr.
 FailureOr<SmallVector<SmallVector<Value>>>
 SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                       ArrayRef<int64_t> shape) {
@@ -370,9 +374,9 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   if (!isWgLayout())
     return failure();
 
-  SmallVector<int64_t> sgLayout = getEffectiveSgLayout().value();
+  SmallVector<int64_t> sgLayout = getSgLayoutAsInt().value();
   SmallVector<int64_t> sgShape;
-  if (auto maybeSgShape = getEffectiveSgData())
+  if (auto maybeSgShape = getSgDataAsInt())
     sgShape = maybeSgShape.value();
   else if (auto ratio = computeShapeRatio(shape, sgLayout))
     sgShape = ratio.value();
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 4cd662f0f6980..3bea8efcdb0ae 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -155,6 +155,9 @@ struct TestXeGPUUnrollingPatterns
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
 
+// Test pattern that distributes vector::StepOp from workgroup to subgroup.
+// It exercises the LayoutTrait interface methods that abstract offset
+// computation over both LayoutAttr and SliceAttr.
 class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
   using OpConversionPattern<vector::StepOp>::OpConversionPattern;
 
@@ -167,8 +170,7 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
     if (!sliceAttr || sliceAttr.getRank() != 1)
       return failure();
 
-    std::optional<SmallVector<int64_t>> sgShape =
-        sliceAttr.getEffectiveSgData();
+    std::optional<SmallVector<int64_t>> sgShape = sliceAttr.getSgDataAsInt();
     if (!sgShape)
       return failure();
 

>From 27da02a9ba57d19aac0c070aedfe5b630350dfff Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 17:45:07 +0000
Subject: [PATCH 25/29] add check for order attribute

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 25ff7cba92a83..e9c6a8eed3dfb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -273,6 +273,14 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
     return failure();
 
   // TODO: handle order attribute
+  auto hasDefaultOrder = [&]() {
+    DenseI32ArrayAttr order = getOrder();
+    return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>(
+                         llvm::reverse(order.asArrayRef())));
+  };
+  if (!hasDefaultOrder())
+    return mlir::emitError(loc, "order attribute is currently not supported.");
+
   auto dims = llvm::map_to_vector(*getSgLayoutAsInt(), [&](int64_t d) -> Value {
     return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
   });

>From e49e1cf52ce5fe7ef67a99d429ef35d28f51ab12 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 17:49:33 +0000
Subject: [PATCH 26/29] clean up

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index e9c6a8eed3dfb..78b3cbdedecf8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -350,15 +350,14 @@ SliceAttr SliceAttr::flatten() const {
       llvm::to_vector(llvm::seq<int64_t>(0, layoutAttr.getRank()));
 
   // get remaining dims (flattend) by applying slice ops with all slicedDims
-  SmallVector<int64_t> remainingIndices(indices);
+  SmallVector<int64_t> remainingDims(indices);
   for (auto dim : llvm::reverse(slicedDims))
-    remainingIndices = XeGPUDialect::slice(
-        llvm::ArrayRef<int64_t>(remainingIndices), dim.asArrayRef());
+    remainingDims = XeGPUDialect::slice(llvm::ArrayRef<int64_t>(remainingDims),
+                                        dim.asArrayRef());
 
   // get flattend sliced dims by applying slice ops with the remaining dims
-  SmallVector<int64_t> flattendDims =
-      XeGPUDialect::slice(llvm::ArrayRef<int64_t>(indices),
-                          llvm::ArrayRef<int64_t>(remainingIndices));
+  SmallVector<int64_t> flattendDims = XeGPUDialect::slice(
+      llvm::ArrayRef<int64_t>(indices), llvm::ArrayRef<int64_t>(remainingDims));
 
   return xegpu::SliceAttr::get(
       getContext(), layoutAttr,

>From 59de4502a82af59202a952ce56635f967fcbd1a1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 6 Aug 2025 19:19:56 +0000
Subject: [PATCH 27/29] clean up

---
 mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt    |  2 +-
 .../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp       | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index bbbeb71410a9b..728f1aa859061 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -17,4 +17,4 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
 mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
 mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
 add_public_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
-add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
\ No newline at end of file
+add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index b0600273b423c..4a5525c8abb30 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -176,21 +176,21 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
                                    layout.dropSgLayoutAndData());
 
     SmallVector<Value> newCreateNdOps;
-    SmallVector<OpFoldResult> wgTileOffsets = op.getMixedOffsets();
+    SmallVector<OpFoldResult> wgOffsets = op.getMixedOffsets();
 
     for (auto tdescOffsets : *maybeTdescOffsets) {
-      SmallVector<OpFoldResult> sgTileOffsets;
+      SmallVector<OpFoldResult> sgOffsets;
       size_t rank = tdescOffsets.size();
       for (size_t i = 0; i < rank; i++) {
-        size_t idx = wgTileOffsets.size() - rank + i;
+        size_t idx = wgOffsets.size() - rank + i;
         Value add = rewriter.createOrFold<index::AddOp>(
             loc, tdescOffsets[i],
-            getValueOrCreateConstantIndexOp(rewriter, loc, wgTileOffsets[idx]));
-        sgTileOffsets.push_back(add);
+            getValueOrCreateConstantIndexOp(rewriter, loc, wgOffsets[idx]));
+        sgOffsets.push_back(add);
       }
 
       auto newOp = xegpu::CreateNdDescOp::create(
-          rewriter, loc, newTdescTy, op.getSource(), sgTileOffsets,
+          rewriter, loc, newTdescTy, op.getSource(), sgOffsets,
           op.getMixedSizes(), op.getMixedStrides());
       newCreateNdOps.push_back(newOp);
     }

>From 1b165521b29e4b595182160f5ffd94340f653c2c Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Aug 2025 15:07:01 +0000
Subject: [PATCH 28/29] address comments

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 36 ++++++++++---------
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 27 ++++++++++++++
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78b3cbdedecf8..35fbe2edd2b2c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -39,30 +39,32 @@ void XeGPUDialect::initialize() {
 
 /// Generates instructions to compute offsets for a subgroup identified by
 /// its multidimensional indices (sgId), using the specified subgroup layout
-/// (sgLayout), subgroup data dimensions (sgShape), and the overall data
-/// dimensions (shape).
+/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data
+/// dimensions (sizePerWg).
 static SmallVector<SmallVector<Value>>
-genOffsetsComputations(OpBuilder &builder, Location loc,
-                       SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
-                       ArrayRef<int64_t> sgShape, ArrayRef<int64_t> shape) {
+genOffsetsComputingInsts(OpBuilder &builder, Location loc,
+                         SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout,
+                         ArrayRef<int64_t> sizePerSg,
+                         ArrayRef<int64_t> sizePerWg) {
 
   SmallVector<SmallVector<Value>> offsets;
 
-  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i]
   SmallVector<Value> localOffsets = llvm::map_to_vector(
-      llvm::zip(sgId, sgShape), [&](const auto &t) -> Value {
+      llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value {
         return builder.createOrFold<index::MulOp>(
             loc, std::get<0>(t),
             builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
       });
 
-  // distUnit[i] is the minimum value between shape[i] and
-  // sgLayout[i] * sgShape[i]
+  // distUnit[i] is the minimum value between sizePerWg[i] and
+  // sgLayout[i] * sizePerSg[i]
   SmallVector<int64_t> distUnit = llvm::map_to_vector(
-      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)),
       [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
 
-  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+  for (SmallVector<int64_t> unitOffs :
+       StaticTileOffsetRange(sizePerWg, distUnit)) {
     SmallVector<Value> base =
         llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
           return builder.create<arith::ConstantIndexOp>(loc, d);
@@ -75,7 +77,7 @@ genOffsetsComputations(OpBuilder &builder, Location loc,
         });
 
     SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
+        llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value {
           return builder.createOrFold<index::RemUOp>(
               loc, std::get<0>(t),
               builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
@@ -300,8 +302,8 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   SmallVector<int64_t> sgShape;
   if (auto maybeSgShape = getSgDataAsInt())
     sgShape = maybeSgShape.value();
-  else if (auto ratio = computeShapeRatio(shape, sgLayout))
-    sgShape = ratio.value();
+  else if (auto derivedShape = computeShapeRatio(shape, sgLayout))
+    sgShape = derivedShape.value();
   else
     return failure();
 
@@ -311,7 +313,8 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
     return failure();
   SmallVector<Value> sgIds = *maybeIds;
 
-  return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
+  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
+                                  shape);
 }
 
 //===----------------------------------------------------------------------===//
@@ -401,7 +404,8 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   SmallVector<Value> sgIds =
       XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
 
-  return genOffsetsComputations(builder, loc, sgIds, sgLayout, sgShape, shape);
+  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
+                                  shape);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 628a4857d1253..fadd4aeba9bec 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,5 +1,8 @@
 // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
 
+#map = affine_map<()[s0] -> (s0 floordiv 4)>
+#map1 = affine_map<()[s0] -> (s0 mod 4)>
+
 gpu.module @test_round_robin_assignment {
   // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
@@ -12,6 +15,30 @@ gpu.module @test_round_robin_assignment {
       gpu.return
     }
 
+  // CHECK-LABEL: create_nd_tdesc_with_shared_data
+  // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32>
+  gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) {
+    //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+    //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]]
+    //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]]
+    //CHECK: [[C16:%.+]] = arith.constant 16 : index
+    //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]]
+    //CHECK: [[C64:%.+]] = arith.constant 64 : index
+    //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
+    //CHECK: [[C0:%.+]] = arith.constant 0 : index
+    //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+    //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
+    //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+    //CHECK: [[C128:%.+]] = arith.constant 128 : index
+    //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+    //CHECK: [[C128_2:%.+]] = arith.constant 128 : index
+    //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C128_2]]
+    //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
+    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+      -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
+    gpu.return
+  }
+
   // CHECK-LABEL: load_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
   gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {

>From 0511e1bfa4b7c3f206655b55f7bbca3368837576 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 8 Aug 2025 16:03:35 +0000
Subject: [PATCH 29/29] cleanup

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp     | 4 ++--
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 35fbe2edd2b2c..d997296a22c20 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -388,8 +388,8 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   SmallVector<int64_t> sgShape;
   if (auto maybeSgShape = getSgDataAsInt())
     sgShape = maybeSgShape.value();
-  else if (auto ratio = computeShapeRatio(shape, sgLayout))
-    sgShape = ratio.value();
+  else if (auto derivedShape = computeShapeRatio(shape, sgLayout))
+    sgShape = derivedShape.value();
   else
     return failure();
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index fadd4aeba9bec..e5cc65e6bd3d7 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -31,11 +31,11 @@ gpu.module @test_round_robin_assignment {
     //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
     //CHECK: [[C128:%.+]] = arith.constant 128 : index
     //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
-    //CHECK: [[C128_2:%.+]] = arith.constant 128 : index
-    //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C128_2]]
+    //CHECK: [[C64_2:%.+]] = arith.constant 64 : index
+    //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
     //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
-      -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
+      -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
     gpu.return
   }