[Mlir-commits] [mlir] [mlir][xegpu] Add definition of SliceAttr (PR #150146)

Chao Chen llvmlistbot at llvm.org
Fri Jul 25 15:51:21 PDT 2025


https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/150146

>From 2bc70b6a8487a8ce0f0e7e0c5ac5bc59035465ab Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 19:46:04 +0000
Subject: [PATCH 01/17] add definition draft of SliceAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 42b5b7a0d4e3f..abbd227b9905f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -330,4 +330,25 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
   let genVerifyDecl = 1;
 }
 
+
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+  let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
+
+  let description = [{
+    Like LayoutAttr, SliceAttr describes data distribution among subgroups or work-items.
+    However, whereas LayoutAttr requires the data to have the same rank as the attribute,
+    SliceAttr permits the data to have a lower rank. In this case, compute units in the
+    specified dimensions share the data, provided that the remaining ranks match the data
+    rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
+    vector.broadcast.
+  }];
+
+  let parameters = (ins
+    "Attribute": $parent,
+    "DenseI64ArrayAttr": $dims
+  );
+
+  let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

>From 3959f9e5027f7c21f420c44a5e34501c115df361 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 21:02:22 +0000
Subject: [PATCH 02/17] add layout traits

---
 mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt |  6 ++++++
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h        |  1 +
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td  | 11 +++++++++--
 mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt          |  1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp            |  1 +
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index 3f8cac4dc07c3..bbbeb71410a9b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -12,3 +12,9 @@ mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
 mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRXeGPUEnumsIncGen)
 add_dependencies(mlir-headers MLIRXeGPUEnumsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
+mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
+mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
+add_public_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
+add_dependencies(mlir-headers MLIRXeGPUAttrInterfaceIncGen)
\ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 8e2784f40ad39..cc8d58d8975b4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -25,6 +25,7 @@ class TensorDescType;
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
 #define GET_ATTRDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index abbd227b9905f..b15dd4a3177f9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -169,7 +169,14 @@ def XeGPU_FenceScopeAttr:
     let assemblyFormat = "$value";
 }
 
-def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
+def LayoutTrait: AttrInterface<"LayoutTrait"> {
+  let cppNamespace = "::mlir::xegpu";
+  let description = [{
+    Common trait for all XeGPU layouts.
+  }];
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
   let summary = [{
     Describes the data distribution to subgroups and work-items for a tensor
     specified by the tensor descriptor.
@@ -331,7 +338,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
 }
 
 
-def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice"> {
+def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   let summary = [{Describes the data distribution and sharing among subgroups or work-items.}];
 
   let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 242a97ccfdf6d..89d986143e965 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
 
   DEPENDS
   MLIRXeGPUIncGen
+  MLIRXeGPUAttrInterfaceIncGen
   MLIRXeGPUAttrsIncGen
   MLIRXeGPUEnumsIncGen
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 78cbf884a1911..63160c98105c3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -753,6 +753,7 @@ LogicalResult ConvertLayoutOp::verify() {
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>

>From 2027cfc98321d8f68a713340cd652ab10625cfee Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 23:46:10 +0000
Subject: [PATCH 03/17] add verifier and interface

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 54 ++++++++++++++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 21 ++++++++
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index b15dd4a3177f9..e3b06714bdcc2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -174,6 +174,17 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
   let description = [{
     Common trait for all XeGPU layouts.
   }];
+
+  let methods = [
+    InterfaceMethod<"Get the effective sg layout",
+                    "std::optional<llvm::SmallVector<int>>",
+                    "getEffectiveSgLayout">,
+    InterfaceMethod<"Get the effective sg data",
+                    "std::optional<llvm::SmallVector<int>>",
+                    "getEffectiveSgData">,
+  ];
+
+
 }
 
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -331,6 +342,18 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
       return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr,
                              getLaneLayout(), getLaneData(), getOrder());
     }
+
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+      if (DenseI32ArrayAttr layout = getSgLayout())
+        return llvm::to_vector(layout.asArrayRef());
+      return std::nullopt;
+    }
+
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+      if (DenseI32ArrayAttr data = getSgData())
+        return llvm::to_vector(data.asArrayRef());
+      return std::nullopt;
+    }
   }];
 
   let assemblyFormat = "`<` struct(params) `>`";
@@ -351,11 +374,40 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   }];
 
   let parameters = (ins
-    "Attribute": $parent,
+    "xegpu::LayoutAttr": $parent,
     "DenseI64ArrayAttr": $dims
   );
 
+  let extraClassDeclaration = [{
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+      if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
+        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+        llvm::SmallVector<int32_t> result;
+        for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
+          if (!llvm::is_contained(dims, i))
+            result.push_back(v);
+        }
+        return result;
+      }
+      return std::nullopt;
+    }
+    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+      if (DenseI32ArrayAttr data = getParent().getSgData()) {
+        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
+        llvm::SmallVector<int32_t> result;
+        for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
+          if (!llvm::is_contained(dims, i))
+            result.push_back(v);
+        }
+        return result;
+      }
+      return std::nullopt;
+
+    }
+  }];
+
   let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+  let genVerifyDecl = 1;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 642c393cbc2c8..7e293b6f0e1a3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -206,6 +206,27 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_SliceAttr
+//===----------------------------------------------------------------------===//
+LogicalResult
+SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+                  xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
+  if (!parent || !dims)
+    return emitError() << "expected parent layout and dims attribute";
+
+  int rank = parent.getRank();
+  // check every element in dims is unique and smaller than rank
+  llvm::SmallDenseSet<int64_t> seen;
+  for (int64_t dim : dims.asArrayRef()) {
+    if (dim >= rank)
+      return emitError() << "invalid dim: " << dim;
+    if (!seen.insert(dim).second)
+      return emitError() << "repeated dim: " << dim;
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//

>From 638c0853dc2b76fbc01d8410cd6bb52aa7d20891 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 15:52:26 +0000
Subject: [PATCH 04/17] add invalid unit test

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  2 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  4 ++--
 mlir/test/Dialect/XeGPU/invalid.mlir          | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index e3b06714bdcc2..d0b2e936d6508 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -406,7 +406,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     }
   }];
 
-  let assemblyFormat = "`<` $parent `,` `dim` `=` $dims `>`";
+  let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
   let genVerifyDecl = 1;
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7e293b6f0e1a3..21007f98643bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -220,9 +220,9 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   llvm::SmallDenseSet<int64_t> seen;
   for (int64_t dim : dims.asArrayRef()) {
     if (dim >= rank)
-      return emitError() << "invalid dim: " << dim;
+      return emitError() << "invalid dim (" << dim << ") in slice attribute.";
     if (!seen.insert(dim).second)
-      return emitError() << "repeated dim: " << dim;
+      return emitError() << "repeated dim (" << dim << ") in slice attribute.";
   }
   return success();
 }
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index eb564d55bfd51..c4e72820e9aec 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -658,3 +658,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
         #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
   return
 }
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{repeated dim (2) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [2, 2]>
+func.func @slice_attr_repeat_dim() {
+  %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+  return
+}
+
+// -----
+#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
+// expected-error@+1 {{invalid dim (3) in slice attribute}}
+#s = #xegpu.slice<#l, dims = [3]>
+func.func @slice_attr_repeat_dim() {
+  %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
+  return
+}
+

>From 91048f06417bd8af3d58d35a516115da044e6451 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 16:06:59 +0000
Subject: [PATCH 05/17] add wrappers

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index d0b2e936d6508..a38878bc6a61f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,8 +183,6 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "std::optional<llvm::SmallVector<int>>",
                     "getEffectiveSgData">,
   ];
-
-
 }
 
 def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
@@ -402,7 +400,18 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
         return result;
       }
       return std::nullopt;
+    }
+
+    DenseI32ArrayAttr getOrder() const {
+      return getParent().getOrder();
+    }
+
+    bool isWgLayout() const {
+      return getParent().isWgLayout();
+    }
 
+    bool isSgLayout() const {
+      return getParent().isSgLayout();
     }
   }];
 

>From ddc42c2886ae3c49f10032caea27817dc6d542de Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 17:51:42 +0000
Subject: [PATCH 06/17] update description

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 78a7c48af837e..8644be8e4204c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,7 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
                     "std::optional<llvm::SmallVector<int>>",
-                    "getEffectiveSgData">,
+                    "getEffectiveSgData">
   ];
 }
 
@@ -375,6 +375,16 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     specified dimensions share the data, provided that the remaining ranks match the data
     rank. SliceAttr is commonly used by operations such as vector.multi_reduction and
     vector.broadcast.
+
+    Example:
+    ```
+    #l = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+    #r = #xegpu.slice<#l, dims = [0]>
+
+    %exp = math.exp %input {layout_result_0 = #l}: vector<256x128xf32>
+    %red = vector.multi_reduction<add>, %exp, %acc [0] {layout_result_0 = #r}: vector<256x128xf32> to vector<128xf32>
+    %bcast = vector.broadcast %red {layout_result_0 = #l} : vector<128xf32> to vector<256x128xf32>
+    ```
   }];
 
   let parameters = (ins

>From 36e2c3a118b0167c6e4f3341533f92353ddaebe2 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 18:44:08 +0000
Subject: [PATCH 07/17] refactor

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h        |  6 +++---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td  | 15 +++------------
 .../include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 12 ++++++++++++
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index cc8d58d8975b4..c2d546fa08fe0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -22,18 +22,18 @@
 namespace mlir {
 namespace xegpu {
 class TensorDescType;
+class LayoutAttr;
 } // namespace xegpu
 } // namespace mlir
 
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
+
 #define GET_ATTRDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc>
 #define GET_TYPEDEF_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc>
-
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
-
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.h.inc>
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 8644be8e4204c..36a12a2c2a029 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -396,24 +396,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
       if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
         llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        llvm::SmallVector<int32_t> result;
-        for (auto [i, v]: llvm::enumerate(layout.asArrayRef())) {
-          if (!llvm::is_contained(dims, i))
-            result.push_back(v);
-        }
-        return result;
+        return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
       }
       return std::nullopt;
     }
+
     std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
       if (DenseI32ArrayAttr data = getParent().getSgData()) {
         llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        llvm::SmallVector<int32_t> result;
-        for (auto [i, v]: llvm::enumerate(data.asArrayRef())) {
-          if (!llvm::is_contained(dims, i))
-            result.push_back(v);
-        }
-        return result;
+        return XeGPUDialect::dropDims(data.asArrayRef(), dims);
       }
       return std::nullopt;
     }
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 549018b61d6fb..f07a758a59b96 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -41,6 +41,18 @@ def XeGPU_Dialect : Dialect {
       /// Checks if the given shape can be evenly distributed based on the layout
       /// and data factors provided by the LayoutAttr.
       static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::LayoutAttr attr);
+
+      /// Drops the entries of `data` at the given positions and returns the
+      /// rest, e.g., data = [32, 64, 8] with dropPositions = [0, 2] gives [64].
+      template<typename T, typename U>
+      static llvm::SmallVector<T> dropDims(llvm::ArrayRef<T> data, llvm::ArrayRef<U> dropPositions) {
+        llvm::SmallVector<T> result;
+        for (auto [i, v]: llvm::enumerate(data)) {
+          if (!llvm::is_contained(dropPositions, i))
+            result.push_back(v);
+        }
+        return result;
+      }
     }];
 }
 

>From 6872e6dbda83d21d960ffb2c5156e89b1381fdfd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 20:26:39 +0000
Subject: [PATCH 08/17] add delinearizeSubgroupId interface

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |  1 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 13 ++++++++++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 19 +++++++++++++++++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  2 ++
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  2 +-
 5 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index c2d546fa08fe0..57919966a90b2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 36a12a2c2a029..96466550cb703 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -187,7 +187,11 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
                     "std::optional<llvm::SmallVector<int>>",
-                    "getEffectiveSgData">
+                    "getEffectiveSgData">,
+    InterfaceMethod<"Delinearize the Subgroup Id",
+                    "FailureOr<SmallVector<Value>>",
+                    "delinearizeSubgroupId",
+                    (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
   ];
 }
 
@@ -358,6 +362,10 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
         return llvm::to_vector(data.asArrayRef());
       return std::nullopt;
     }
+
+    FailureOr<SmallVector<Value>>
+    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
   }];
 
   let assemblyFormat = "`<` struct(params) `>`";
@@ -409,6 +417,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
       return std::nullopt;
     }
 
+    FailureOr<llvm::SmallVector<Value>>
+    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
     }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 836478a807761..974e42140e54e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -211,6 +212,18 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+FailureOr<SmallVector<Value>>
+LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
+                                  OpBuilder &builder) {
+  assert(isWgLayout() && "delinearizeSubgroupId is only available for "
+                         "workgroup-level layout attribute.");
+  auto dims =
+      llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
+        return arith::ConstantIndexOp::create(builder, loc, d);
+      });
+  return affine::delinearizeIndex(builder, loc, linearId, dims);
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -232,6 +245,12 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   return success();
 }
 
+FailureOr<SmallVector<Value>>
+SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
+                                 OpBuilder &builder) {
+  return getParent().delinearizeSubgroupId(linearId, loc, builder);
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_TensorDescType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index edc18025136ac..a7013ed470cab 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -838,7 +838,9 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 } // namespace xegpu
 } // namespace mlir
 
+namespace mlir {
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.cpp.inc>
+} // namespace mlir
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc>
 #define GET_OP_CLASSES
 #include <mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc>
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index ef52323a9f46b..2168d43eb701b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -175,7 +175,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     }
 
     auto deLinearizeSgId =
-        affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
+        layout.delinearizeSubgroupId(linearSgId, loc, rewriter);
     if (failed(deLinearizeSgId))
       return failure();
     SmallVector<Value> sgIds = *deLinearizeSgId;

>From 223fab912e9987e7a7ed7440fb6fd42b2d0a4dd8 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 21:05:46 +0000
Subject: [PATCH 09/17] fix format

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 57919966a90b2..eb74b8142688f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -27,8 +27,8 @@ class LayoutAttr;
 } // namespace xegpu
 } // namespace mlir
 
-#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
 #include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
 
 #define GET_ATTRDEF_CLASSES

>From 60e20a02b991a4276f74937ea69c483d780d2e49 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 24 Jul 2025 23:33:27 +0000
Subject: [PATCH 10/17] add impl of getOffsets for LayoutAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 65 +++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 80 +++++++++++++++++--
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  2 +-
 3 files changed, 113 insertions(+), 34 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 94a294fdc5705..5794f786dc9b9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -183,15 +183,20 @@ def LayoutTrait: AttrInterface<"LayoutTrait"> {
 
   let methods = [
     InterfaceMethod<"Get the effective sg layout",
-                    "std::optional<llvm::SmallVector<int>>",
+                    "std::optional<SmallVector<int64_t>>",
                     "getEffectiveSgLayout">,
     InterfaceMethod<"Get the effective sg data",
-                    "std::optional<llvm::SmallVector<int>>",
+                    "std::optional<SmallVector<int64_t>>",
                     "getEffectiveSgData">,
     InterfaceMethod<"Delinearize the Subgroup Id",
                     "FailureOr<SmallVector<Value>>",
                     "delinearizeSubgroupId",
-                    (ins "Value":$linearId, "Location":$loc, "OpBuilder &": $builder)>
+                    (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
+
+    InterfaceMethod<"Get the local offset to be accessed by the given subgroup Id",
+                    "FailureOr<SmallVector<SmallVector<Value>>>",
+                    "getOffsets",
+                    (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>
   ];
 }
 
@@ -351,20 +356,23 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [LayoutTrait]> {
                              getLaneLayout(), getLaneData(), getOrder());
     }
 
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
+    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
       if (DenseI32ArrayAttr layout = getSgLayout())
-        return llvm::to_vector(layout.asArrayRef());
+        return llvm::to_vector_of<int64_t>(layout.asArrayRef());
       return std::nullopt;
     }
 
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
+    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
       if (DenseI32ArrayAttr data = getSgData())
-        return llvm::to_vector(data.asArrayRef());
+        return llvm::to_vector_of<int64_t>(data.asArrayRef());
       return std::nullopt;
     }
 
     FailureOr<SmallVector<Value>>
-    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
+    delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+    FailureOr<SmallVector<SmallVector<Value>>>
+    getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
   }];
 
@@ -401,24 +409,6 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
   );
 
   let extraClassDeclaration = [{
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgLayout() const {
-      if (DenseI32ArrayAttr layout = getParent().getSgLayout()) {
-        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(layout.asArrayRef(), dims);
-      }
-      return std::nullopt;
-    }
-
-    std::optional<llvm::SmallVector<int32_t>> getEffectiveSgData() const {
-      if (DenseI32ArrayAttr data = getParent().getSgData()) {
-        llvm::ArrayRef<int64_t> dims = getDims().asArrayRef();
-        return XeGPUDialect::dropDims(data.asArrayRef(), dims);
-      }
-      return std::nullopt;
-    }
-
-    FailureOr<llvm::SmallVector<Value>>
-    delinearizeSubgroupId(Value linearId, Location loc, OpBuilder &builder);
 
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
@@ -431,6 +421,29 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
     bool isSgLayout() const {
       return getParent().isSgLayout();
     }
+
+    std::optional<SmallVector<int64_t>> getEffectiveSgLayout() const {
+      if (auto layout = getParent().getEffectiveSgLayout()) {
+        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*layout), dims);
+      }
+      return std::nullopt;
+    }
+
+    std::optional<SmallVector<int64_t>> getEffectiveSgData() const {
+      if (auto data = getParent().getEffectiveSgData()) {
+        ArrayRef<int64_t> dims = getDims().asArrayRef();
+        return XeGPUDialect::dropDims(llvm::ArrayRef<int64_t>(*data), dims);
+      }
+      return std::nullopt;
+    }
+
+    FailureOr<SmallVector<Value>>
+    delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+
+    FailureOr<SmallVector<SmallVector<Value>>>
+    getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
+
   }];
 
   let assemblyFormat = "`<` $parent `,` `dims` `=` $dims `>`";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 91d7b2a137efd..682f0620dbcfb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
@@ -213,17 +215,75 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 }
 
 FailureOr<SmallVector<Value>>
-LayoutAttr::delinearizeSubgroupId(Value linearId, Location loc,
-                                  OpBuilder &builder) {
-  assert(isWgLayout() && "delinearizeSubgroupId is only available for "
-                         "workgroup-level layout attribute.");
+LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+                                  Value linearId) {
+  // delinearizeSubgroupId is only available for workgroup-level layout
+  // attribute
+  if (!isWgLayout())
+    return failure();
+
   auto dims =
       llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
         return arith::ConstantIndexOp::create(builder, loc, d);
       });
+
   return affine::delinearizeIndex(builder, loc, linearId, dims);
 }
 
+FailureOr<SmallVector<SmallVector<Value>>>
+LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+                       ArrayRef<int64_t> shape) {
+  if (!isWgLayout())
+    return failure();
+
+  auto sgLayout = getEffectiveSgLayout().value();
+  SmallVector<int64_t> sgShape;
+  if (auto maybeSgShape = getEffectiveSgData())
+    sgShape = maybeSgShape.value();
+  else if (auto ratio = computeShapeRatio(shape, sgLayout))
+    sgShape = ratio.value();
+  else
+    return failure();
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  // delinearize Ids
+  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  if (failed(maybeIds))
+    return failure();
+  SmallVector<Value> sgIds = *maybeIds;
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+        auto &[id, s] = t;
+        Value d = arith::ConstantIndexOp::create(builder, loc, s);
+        return index::MulOp::create(builder, loc, id, d);
+      });
+
+  SmallVector<SmallVector<Value>> offsets;
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return arith::ConstantIndexOp::create(builder, loc, d);
+        });
+
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return arith::AddIOp::create(builder, loc, std::get<0>(t),
+                                       std::get<1>(t));
+        });
+
+    offsets.push_back(adds);
+  }
+
+  return offsets;
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -246,9 +306,15 @@ SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 }
 
 FailureOr<SmallVector<Value>>
-SliceAttr::delinearizeSubgroupId(Value linearId, Location loc,
-                                 OpBuilder &builder) {
-  return getParent().delinearizeSubgroupId(linearId, loc, builder);
+SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
+                                 Value linearId) {
+  return getParent().delinearizeSubgroupId(builder, loc, linearId);
+}
+
+FailureOr<SmallVector<SmallVector<Value>>>
+SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
+                      ArrayRef<int64_t> shape) {
+  return failure();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f914914dc6b9f..e3cf5473076e7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -213,7 +213,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     }
 
     auto deLinearizeSgId =
-        layout.delinearizeSubgroupId(adjustedSgId, loc, rewriter);
+        layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
     if (failed(deLinearizeSgId))
       return failure();
     SmallVector<Value> sgIds = *deLinearizeSgId;

>From 3630966307810ff8ee47aa7d95328ebba225724e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:25:52 +0000
Subject: [PATCH 11/17] apply getOffsets in CreateNdDescOp

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 29 +++++++-----
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 44 +++++++++----------
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 42 +++++++++---------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 682f0620dbcfb..0b5ecfc210281 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -217,14 +217,14 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
 FailureOr<SmallVector<Value>>
 LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
                                   Value linearId) {
-  // delinearizeSubgroupId is only available for workgroup-level layout
-  // attribute
+  // delinearizeSubgroupId is only available for
+  // workgroup-level layout attribute
   if (!isWgLayout())
     return failure();
 
   auto dims =
-      llvm::map_to_vector(getSgLayout().asArrayRef(), [&](int32_t d) -> Value {
-        return arith::ConstantIndexOp::create(builder, loc, d);
+      llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
+        return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
       });
 
   return affine::delinearizeIndex(builder, loc, linearId, dims);
@@ -260,25 +260,32 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
   // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
   SmallVector<Value> localOffsets = llvm::map_to_vector(
       llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
-        auto &[id, s] = t;
-        Value d = arith::ConstantIndexOp::create(builder, loc, s);
-        return index::MulOp::create(builder, loc, id, d);
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
       });
 
   SmallVector<SmallVector<Value>> offsets;
   for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
     SmallVector<Value> base =
         llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
-          return arith::ConstantIndexOp::create(builder, loc, d);
+          return builder.create<arith::ConstantIndexOp>(loc, d);
         });
 
     SmallVector<Value> adds = llvm::map_to_vector(
         llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
-          return arith::AddIOp::create(builder, loc, std::get<0>(t),
-                                       std::get<1>(t));
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
         });
 
-    offsets.push_back(adds);
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
   }
 
   return offsets;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index e3cf5473076e7..af55f176cb84f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -212,39 +212,39 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
           rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
     }
 
-    auto deLinearizeSgId =
-        layout.delinearizeSubgroupId(rewriter, loc, adjustedSgId);
-    if (failed(deLinearizeSgId))
+    auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
+    if (failed(tdescOffsets))
       return failure();
-    SmallVector<Value> sgIds = *deLinearizeSgId;
-
-    // Calculate distribution unit shape and local offsets for subgroup
-    SmallVector<int64_t> distUnitShape(sgLayout.size());
-    SmallVector<Value> localOffset(sgLayout.size());
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], wgShape[i]);
-      localOffset[i] =
-          rewriter.createOrFold<index::MulOp>(loc, sgIds[i], sgDataDim[i]);
-    }
-
-    SmallVector<OpFoldResult> originalOffsets = op.getMixedOffsets();
 
     xegpu::TensorDescType newTdescTy =
         xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
                                    layout.dropSgLayoutAndData());
+
     SmallVector<Value> newCreateNdOps;
-    for (SmallVector<int64_t> distUnitBaseAddr :
-         StaticTileOffsetRange(wgShape, distUnitShape)) {
-      SmallVector<OpFoldResult> globalOffsets =
-          calculateGlobalOffsets(rewriter, loc, originalOffsets, localOffset,
-                                 distUnitBaseAddr, distUnitShape);
+    SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+
+    for (auto tdescOffset : *tdescOffsets) {
+      SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
+          llvm::zip_longest(tdescOffset, offset),
+          [&](const auto &t) -> OpFoldResult {
+            std::optional<Value> off = std::get<0>(t);
+            std::optional<OpFoldResult> old = std::get<1>(t);
+            if (!off.has_value())
+              return *old;
+
+            if (!old.has_value() || isZeroInteger(*old))
+              return *off;
+
+            return rewriter.createOrFold<index::AddOp>(
+                loc, *off,
+                getValueOrCreateConstantIndexOp(rewriter, loc, *old));
+          });
 
       auto newCreateNdOp = xegpu::CreateNdDescOp::create(
-          rewriter, loc, newTdescTy, op.getSource(), globalOffsets,
+          rewriter, loc, newTdescTy, op.getSource(), newOffsets,
           op.getMixedSizes(), op.getMixedStrides());
       newCreateNdOps.push_back(newCreateNdOp);
     }
-
     rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
     return success();
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index d51122417fb61..5e6a227e92320 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -4,27 +4,25 @@
 //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
 gpu.module @test_1_1_assignment {
   // CHECK-LABEL: create_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
+  // CHECK-SAME: [[ARG_0:%.*]]: memref<24x32xf32>
   gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
-  // CHECK: %[[SGID:.*]] = gpu.subgroup_id
-  // CHECK: %[[C12:.*]] = arith.constant 12 : index
-  // CHECK: %[[C4:.*]] = arith.constant 4 : index
-  // CHECK: %[[C8:.*]] = arith.constant 8 : index
-  // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
-  // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
-  // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
-  // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
-  // CHECK: %[[C24:.*]] = arith.constant 24 : index
-  // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
-  // CHECK: %[[C32:.*]] = arith.constant 32 : index
-  // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
-  // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
-  // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
-  // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
-  // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-  // CHECK: gpu.return
+  //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+  //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+  //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+  //CHECK: [[C12:%.+]] = arith.constant 12 : index
+  //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C12]]
+  //CHECK: [[C8:%.+]] = arith.constant 8 : index
+  //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C8]]
+  //CHECK: [[C0:%.+]] = arith.constant 0 : index
+  //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
+  //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+  //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
+  //CHECK: [[C24:%.+]] = arith.constant 24 : index
+  //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C24]]
+  //CHECK: [[C32:%.+]] = arith.constant 32 : index
+  //CHECK: [[X:%.+]] = index.remu [[UX]], [[C32]]
+  //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<24x32xf32> -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
+
   %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
     -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
   gpu.return
@@ -180,7 +178,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       -> vector<24x1xf32>
     // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
     // CHECK-SAME: : vector<12x1xf32> to vector<12x8xf32>
-    %broadcast = vector.broadcast %load 
+    %broadcast = vector.broadcast %load
       {layout_result_0 = #xegpu.layout<sg_layout = [2, 1], sg_data = [12, 8], lane_layout = [2, 1], lane_data = [1, 1]>}
       : vector<24x1xf32> to vector<24x8xf32>
     gpu.return
@@ -367,7 +365,7 @@ gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
   // CHECK-LABEL: @subgroup_id_range_nested_if
   gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
     %sg_id = gpu.subgroup_id : index
-    %c1 = arith.constant 1 : i1 
+    %c1 = arith.constant 1 : i1
     %c3 = arith.constant 3 : index
     %c32 = arith.constant 32 : index
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>

>From 398d69beac1e69ef72f23dea5b5649e4dc9a0ffd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 01:32:43 +0000
Subject: [PATCH 12/17] cleanup

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  1 +
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 59 +++----------------
 2 files changed, 8 insertions(+), 52 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0b5ecfc210281..ef336ce800385 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -222,6 +222,7 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
   if (!isWgLayout())
     return failure();
 
+  // TODO: handle order attribute
   auto dims =
       llvm::map_to_vector(*getEffectiveSgLayout(), [&](int64_t d) -> Value {
         return builder.createOrFold<arith::ConstantIndexOp>(loc, d);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af55f176cb84f..640d74d3e3715 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -125,39 +125,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
 struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
-  // Calculate offset for each subgroup
-  static SmallVector<OpFoldResult>
-  calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
-                         const SmallVector<OpFoldResult> &originalOffsets,
-                         const SmallVector<Value> &localOffset,
-                         const SmallVector<int64_t> &distUnitBaseAddr,
-                         const SmallVector<int64_t> &distUnitShape) {
-    assert(localOffset.size() == distUnitBaseAddr.size() &&
-           "localOffset and distUnitBaseAddr must have the same rank");
-
-    SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
-                                            originalOffsets.end());
-    size_t rank = localOffset.size();
-    for (size_t i = 0; i < rank; ++i) {
-      size_t dimIdx = originalOffsets.size() - rank + i;
-      Value constOffset =
-          arith::ConstantIndexOp::create(rewriter, loc, distUnitBaseAddr[i]);
-      Value offset =
-          rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
-      Value modValue =
-          arith::ConstantIndexOp::create(rewriter, loc, distUnitShape[i]);
-      Value offsetMod =
-          rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
-      Value origOffset = getValueOrCreateConstantIndexOp(
-          rewriter, loc, originalOffsets[dimIdx]);
-      Value globalOffset =
-          rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
-      globalOffsets[dimIdx] = globalOffset;
-    }
-
-    return globalOffsets;
-  }
-
   LogicalResult
   matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
@@ -177,28 +144,14 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       return rewriter.notifyMatchFailure(
           op, "sgLayout attribute is required in layout");
 
-    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
-
-    // TODO : Handle order attribute
     // Get the subgroup ID
-    auto linearSgId =
+    Value linearSgId =
         gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
 
-    // Create constants for layout dimensions
-    SmallVector<Value> sgLayoutDim(sgLayout.size());
-    SmallVector<Value> sgDataDim(sgShape.size());
-
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      sgLayoutDim[i] =
-          arith::ConstantIndexOp::create(rewriter, loc, sgLayout[i]);
-      sgDataDim[i] = arith::ConstantIndexOp::create(rewriter, loc, sgShape[i]);
-    }
-
     int64_t startOfRange = -1, endOfRange = -1;
     bool sgIdRangeSpecified =
         isSgIdRangeSpecified(op, startOfRange, endOfRange);
 
-    Value adjustedSgId = linearSgId;
     if (sgIdRangeSpecified) {
       int64_t sgCount = endOfRange - startOfRange;
       if (computeProduct(sgLayout) != sgCount)
@@ -208,14 +161,16 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       // sg id
       Value startOfRangeVal =
           rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
-      adjustedSgId =
+      linearSgId =
           rewriter.createOrFold<index::SubOp>(loc, linearSgId, startOfRangeVal);
     }
 
-    auto tdescOffsets = layout.getOffsets(rewriter, loc, adjustedSgId, wgShape);
-    if (failed(tdescOffsets))
+    auto maybeTdescOffsets =
+        layout.getOffsets(rewriter, loc, linearSgId, wgShape);
+    if (failed(maybeTdescOffsets))
       return failure();
 
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
     xegpu::TensorDescType newTdescTy =
         xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
                                    layout.dropSgLayoutAndData());
@@ -223,7 +178,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     SmallVector<Value> newCreateNdOps;
     SmallVector<OpFoldResult> offset = op.getMixedOffsets();
 
-    for (auto tdescOffset : *tdescOffsets) {
+    for (auto tdescOffset : *maybeTdescOffsets) {
       SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
           llvm::zip_longest(tdescOffset, offset),
           [&](const auto &t) -> OpFoldResult {

>From 08e4aa9c6df06e5d7eec54c63c96877dcc1631ac Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:28:40 +0000
Subject: [PATCH 13/17] fix a bug

---
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 640d74d3e3715..688e2b25867b3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -179,26 +179,20 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
     SmallVector<OpFoldResult> offset = op.getMixedOffsets();
 
     for (auto tdescOffset : *maybeTdescOffsets) {
-      SmallVector<OpFoldResult> newOffsets = llvm::map_to_vector(
-          llvm::zip_longest(tdescOffset, offset),
-          [&](const auto &t) -> OpFoldResult {
-            std::optional<Value> off = std::get<0>(t);
-            std::optional<OpFoldResult> old = std::get<1>(t);
-            if (!off.has_value())
-              return *old;
-
-            if (!old.has_value() || isZeroInteger(*old))
-              return *off;
-
-            return rewriter.createOrFold<index::AddOp>(
-                loc, *off,
-                getValueOrCreateConstantIndexOp(rewriter, loc, *old));
-          });
-
-      auto newCreateNdOp = xegpu::CreateNdDescOp::create(
+      SmallVector<OpFoldResult> newOffsets;
+      size_t rank = tdescOffset.size();
+      for (size_t i = 0; i < rank; i++) {
+        size_t idx = offset.size() - rank + i;
+        Value newOff = rewriter.createOrFold<index::AddOp>(
+            loc, tdescOffset[i],
+            getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
+        newOffsets.push_back(newOff);
+      }
+
+      auto newOp = xegpu::CreateNdDescOp::create(
           rewriter, loc, newTdescTy, op.getSource(), newOffsets,
           op.getMixedSizes(), op.getMixedStrides());
-      newCreateNdOps.push_back(newCreateNdOp);
+      newCreateNdOps.push_back(newOp);
     }
     rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
     return success();

>From 62aa1dde2f1c47bf3d9b45582c668c33ef64a987 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 02:36:15 +0000
Subject: [PATCH 14/17] cleanup

---
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 688e2b25867b3..dae1f06a8fbad 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -157,8 +157,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       if (computeProduct(sgLayout) != sgCount)
         return rewriter.notifyMatchFailure(
             op, "sg_layout size must match the sg_id_range");
-      // Subtract startOfRange from the original subgroup id to get the adjusted
-      // sg id
+      // Subtract startOfRange from the original subgroup id to get
+      // the adjusted sg id
       Value startOfRangeVal =
           rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
       linearSgId =
@@ -176,17 +176,17 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
                                    layout.dropSgLayoutAndData());
 
     SmallVector<Value> newCreateNdOps;
-    SmallVector<OpFoldResult> offset = op.getMixedOffsets();
+    SmallVector<OpFoldResult> oldOffsets = op.getMixedOffsets();
 
-    for (auto tdescOffset : *maybeTdescOffsets) {
+    for (auto tdescOffsets : *maybeTdescOffsets) {
       SmallVector<OpFoldResult> newOffsets;
-      size_t rank = tdescOffset.size();
+      size_t rank = tdescOffsets.size();
       for (size_t i = 0; i < rank; i++) {
-        size_t idx = offset.size() - rank + i;
-        Value newOff = rewriter.createOrFold<index::AddOp>(
-            loc, tdescOffset[i],
-            getValueOrCreateConstantIndexOp(rewriter, loc, offset[idx]));
-        newOffsets.push_back(newOff);
+        size_t idx = oldOffsets.size() - rank + i;
+        Value add = rewriter.createOrFold<index::AddOp>(
+            loc, tdescOffsets[i],
+            getValueOrCreateConstantIndexOp(rewriter, loc, oldOffsets[idx]));
+        newOffsets.push_back(add);
       }
 
       auto newOp = xegpu::CreateNdDescOp::create(

>From de0a1bbc63ac3eb04ae1e900a892dba8d03005f0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 17:18:09 +0000
Subject: [PATCH 15/17] add unit test

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |   4 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  65 ++++++++++-
 mlir/test/Dialect/XeGPU/layout.mlir           |   6 +
 .../Dialect/XeGPU/xegpu-attr-interface.mlir   |  23 ++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 107 ++++++++++++++++++
 5 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 5794f786dc9b9..4f35e3ff061a4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -410,6 +410,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [LayoutTrait]> {
 
   let extraClassDeclaration = [{
 
+    int64_t getRank() const {
+      return getParent().getRank() - getDims().size();
+    }
+
     DenseI32ArrayAttr getOrder() const {
       return getParent().getOrder();
     }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ef336ce800385..fad3c6280fbbe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -296,7 +296,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
 LogicalResult
-SliceAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
                   xegpu::LayoutAttr parent, DenseI64ArrayAttr dims) {
   if (!parent || !dims)
     return emitError() << "expected parent layout and dims attribute";
@@ -322,7 +322,68 @@ SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
 FailureOr<SmallVector<SmallVector<Value>>>
 SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
                       ArrayRef<int64_t> shape) {
-  return failure();
+  assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
+  if (!isWgLayout())
+    return failure();
+
+  auto sgLayout = getEffectiveSgLayout().value();
+
+  SmallVector<int64_t> sgShape;
+  if (auto maybeSgShape = getEffectiveSgData())
+    sgShape = maybeSgShape.value();
+  else if (auto ratio = computeShapeRatio(shape, sgLayout))
+    sgShape = ratio.value();
+  else
+    return failure();
+
+  // distUnit[i] is the minimum value between shape[i] and
+  // sgLayout[i] * sgShape[i]
+  SmallVector<int64_t> distUnit = llvm::map_to_vector(
+      llvm::zip_equal(shape, computeElementwiseMul(sgLayout, sgShape)),
+      [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); });
+
+  // delinearize Ids
+  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  if (failed(maybeIds))
+    return failure();
+  // The effective sgIds used for offset computation are
+  // those of the dims that are not sliced.
+  ArrayRef<int64_t> dims = getDims().asArrayRef();
+  SmallVector<Value> sgIds =
+      XeGPUDialect::dropDims(ArrayRef<Value>(*maybeIds), dims);
+
+  // nd local offset, localOffset[i] = sgId[i] * sgShape[i]
+  SmallVector<Value> localOffsets = llvm::map_to_vector(
+      llvm::zip(sgIds, sgShape), [&](const auto &t) -> Value {
+        return builder.createOrFold<index::MulOp>(
+            loc, std::get<0>(t),
+            builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+      });
+
+  SmallVector<SmallVector<Value>> offsets;
+  for (SmallVector<int64_t> unitOffs : StaticTileOffsetRange(shape, distUnit)) {
+    SmallVector<Value> base =
+        llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value {
+          return builder.create<arith::ConstantIndexOp>(loc, d);
+        });
+
+    SmallVector<Value> adds = llvm::map_to_vector(
+        llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value {
+          return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t),
+                                                     std::get<1>(t));
+        });
+
+    SmallVector<Value> mods = llvm::map_to_vector(
+        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+          return builder.createOrFold<index::RemUOp>(
+              loc, std::get<0>(t),
+              builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
+        });
+
+    offsets.push_back(mods);
+  }
+
+  return offsets;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 017dacc8d629a..e5330951b065a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -50,4 +50,10 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
   gpu.return
 }
 
+gpu.func @slice_attr_repeat_dim() {
+  //CHECK: arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+  %cst = arith.constant {layout_result_0 = #xegpu.slice<<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
+  gpu.return
+}
+
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
new file mode 100644
index 0000000000000..6397b7fe525b8
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s
+
+#block = #xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>
+#slice = #xegpu.slice<#block, dims=[1]>
+
+//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 8)>
+gpu.module @test_1_1_assignment {
+  gpu.func @create_nd_tdesc() -> vector<128xindex> {
+    //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
+    //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
+    //CHECK: [[c32:%.+]] = arith.constant 32 : index
+    //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+    //CHECK: [[c0:%.+]] = arith.constant 0 : index
+    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
+    //CHECK: [[c128:%.+]] = arith.constant 128 : index
+    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
+    //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
+    //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
+    %step = vector.step {layout_result_0 = #slice}: vector<128xindex>
+    gpu.return %step : vector<128xindex>
+  }
+}
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index f71fcf7ca297b..1e96280769060 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace mlir;
@@ -149,12 +152,116 @@ struct TestXeGPUUnrollingPatterns
   }
 };
 
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "test-xegpu-layout-interface"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { // Test-only pattern: rewrites a vector.step whose result carries a rank-1 xegpu::SliceAttr into a step of the effective sg-data shape plus one offset add per returned offset list.
+  using OpConversionPattern<vector::StepOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+
+    auto layoutName = xegpu::getLayoutName(op->getResult(0)); // attribute name used to look up the layout of result 0
+    auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+    if (!sliceAttr || sliceAttr.getRank() != 1) // only ops annotated with a rank-1 slice layout are rewritten
+      return failure();
+
+    std::optional<SmallVector<int64_t>> sgShape =
+        sliceAttr.getEffectiveSgData();
+    if (!sgShape) // no effective sg data available: nothing to rewrite to
+      return failure();
+
+    Location loc = op.getLoc();
+    VectorType type = op.getResult().getType();
+    auto wgShape = type.getShape(); // shape of the original (pre-rewrite) result vector
+
+    Value sgId =
+        gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+    auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape); // offsets are derived from the subgroup id and the original shape by the attribute itself
+    if (failed(maybeOffsets))
+      return failure();
+
+    VectorType newTy = type.cloneWith(*sgShape, type.getElementType()); // same element type, shape replaced by the effective sg data
+    Value base = vector::StepOp::create(rewriter, loc, newTy); // one shared base step, reused for every offset list
+    SmallVector<Value> newOps;
+    for (auto offsets : *maybeOffsets) {
+      Value bcast =
+          vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); // only element 0 is read, matching the rank-1 guard above
+      Value add = arith::AddIOp::create(rewriter, loc, base, bcast);
+      newOps.push_back(add);
+    }
+    rewriter.replaceOpWithMultiple(op, {newOps}); // the single original result is replaced by the whole list of new values
+    return success();
+  }
+};
+
+struct TestXeGPULayoutInterface // Test pass on gpu.module ops that applies TestStepOpPattern through a partial dialect conversion.
+    : public PassWrapper<TestXeGPULayoutInterface,
+                         OperationPass<gpu::GPUModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPULayoutInterface)
+
+  StringRef getArgument() const final { return "test-xegpu-layout-interface"; } // command-line flag that selects this pass
+
+  StringRef getDescription() const final {
+    return "Test the implementation of XeGPU Layout interfaces";
+  }
+
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override { // dialects registered up front for the pass
+    registry.insert<arith::ArithDialect>();
+    registry.insert<memref::MemRefDialect>();
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<vector::VectorDialect>();
+    registry.insert<index::IndexDialect>();
+  }
+
+  TestXeGPULayoutInterface() = default;
+  TestXeGPULayoutInterface(const TestXeGPULayoutInterface &pass)
+      : PassWrapper(pass) {}
+
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+
+    TypeConverter typeConverter;
+    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type, // shared source/target materialization: wraps the inputs in an UnrealizedConversionCastOp and returns its first result
+                               mlir::ValueRange inputs,
+                               mlir::Location loc) -> mlir::Value {
+      return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+          .getResult(0);
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+
+    RewritePatternSet patterns(ctx);
+    patterns.add<TestStepOpPattern>(typeConverter, ctx);
+
+    ConversionTarget target(*ctx);
+    auto isLegal = [&](xegpu::SliceAttr layout) -> bool { // legal when there is no slice layout, or the layout reports !isWgLayout()
+      return !layout || !layout.isWgLayout();
+    };
+
+    target.addDynamicallyLegalOp<vector::StepOp>(
+        [&](vector::StepOp op) -> bool {
+          auto layoutName = xegpu::getLayoutName(op->getResult(0)); // same attribute lookup as in TestStepOpPattern
+          auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
+          return isLegal(sliceAttr);
+        });
+
+    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; }); // every other op is treated as legal
+
+    (void)applyPartialConversion(getOperation(), target, std::move(patterns)); // result is discarded, so a failed conversion does not fail the pass -- NOTE(review): confirm this is intended rather than calling signalPassFailure()
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace test {
 void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
+  PassRegistration<TestXeGPULayoutInterface>(); // also registers the layout-interface test pass added by this patch
 }
 } // namespace test
 } // namespace mlir

>From e7f2977e79bca34b5bf6fabda74d95d4c934fd7e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 19:24:09 +0000
Subject: [PATCH 16/17] fix a typo

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fad3c6280fbbe..835da3a52885e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -280,7 +280,7 @@ LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
         });
 
     SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
           return builder.createOrFold<index::RemUOp>(
               loc, std::get<0>(t),
               builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));
@@ -374,7 +374,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
         });
 
     SmallVector<Value> mods = llvm::map_to_vector(
-        llvm::zip_equal(adds, distUnit), [&](const auto &t) -> Value {
+        llvm::zip_equal(adds, shape), [&](const auto &t) -> Value {
           return builder.createOrFold<index::RemUOp>(
               loc, std::get<0>(t),
               builder.create<arith::ConstantIndexOp>(loc, std::get<1>(t)));

>From e3e4a618b65e7f6375d66d00d87ced9eac4b7629 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 25 Jul 2025 22:50:59 +0000
Subject: [PATCH 17/17] add unit test

---
 mlir/test/Dialect/XeGPU/layout.mlir | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index e5330951b065a..af13f69ab2d8a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -56,4 +56,15 @@ gpu.func @slice_attr_repeat_dim() {
   gpu.return
 }
 
+gpu.func @softmax_dim_0(%arg0: vector<256x128xf32>) -> vector<256x128xf32> { // softmax-style sequence: exp, add-reduction over dim 0, broadcast back, divide
+  %cst = arith.constant dense<0.000000e+00> : vector<128xf32>
+  %0 = math.exp %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32>
+  //CHECK: vector.multi_reduction <add>, {{.*}} {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0] : vector<256x128xf32> to vector<128xf32>
+  //CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+  %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<128xf32> to vector<256x128xf32>
+  %3 = arith.divf %0, %2 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xf32> // element-wise division of the exp result by the broadcast reduction
+  gpu.return %3 : vector<256x128xf32>
+}
+
 }



More information about the Mlir-commits mailing list