[Mlir-commits] [mlir] [MLIR][XeGPU] Enhance XeGPU lane layout to support "wrap-around" distribution (PR #186958)

Thu Mar 19 15:08:40 PDT 2026

https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/186958

>From 78033e901efab7bbc61d97c3ba712dbe0bbcd192 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 14 Mar 2026 21:13:31 +0000
Subject: [PATCH 1/7] add computeDistributedShape and use it in two
 distribution processes

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 57 ++++++++++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 87 +++++++++++++++++--
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 44 ++++++----
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 56 +++++++-----
 mlir/test/Dialect/XeGPU/invalid.mlir          |  8 --
 mlir/test/Dialect/XeGPU/layout.mlir           | 10 +++
 .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir       | 20 ++---
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 16 ++--
 8 files changed, 225 insertions(+), 73 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index ce0cce65373e5..7eba66d4485e2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -270,6 +270,63 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                     "FailureOr<SmallVector<SmallVector<Value>>>",
                     "computeDistributedCoords",
                     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>,
+    InterfaceMethod<[{Computes the per-compute-unit shape by dividing each dimension of
+                      `shape` by the corresponding layout factor (sg_layout or
+                      lane_layout). For wrap-around dimensions where the division is uneven,
+                      the tensor tile is broadcast to all subgroups/lanes.}],
+                    /*retTy=*/"FailureOr<SmallVector<int64_t>>",
+                    /*methodName=*/"computeDistributedShape",
+                    /*args=*/(ins "SmallVector<int64_t>":$shape),
+                    /*methodBody=*/[{
+                      SmallVector<int64_t> layout;
+                      SmallVector<int64_t> subShape;
+                      if ($_self.isForWorkgroup()) {
+                        layout = $_self.getEffectiveSgLayoutAsInt();
+                        subShape = $_self.getEffectiveSgDataAsInt();
+                      } else if ($_self.isForSubgroup()) {
+                        layout = $_self.getEffectiveLaneLayoutAsInt();
+                        subShape = $_self.getEffectiveLaneDataAsInt();
+                      } else {
+                        return failure();
+                      }
+                      assert(
+                          !subShape.empty() &&
+                          "sgdata or lanedata cannot be empty for distributed shape computation");
+
+                      SmallVector<int64_t> distributedShape(shape.size());
+                      llvm::errs() << "computeDistributedShape:\n";
+                      llvm::errs() << "  shape: [";
+                        llvm::interleaveComma(shape, llvm::errs());
+                        llvm::errs() << "]\n";
+                      llvm::errs() << "  layout: [";
+                        llvm::interleaveComma(layout, llvm::errs());
+                        llvm::errs() << "]\n";
+                      llvm::errs() << "  subShape: [";
+                        llvm::interleaveComma(subShape, llvm::errs());
+                        llvm::errs() << "]\n";
+                      for (auto [i, dim] : llvm::enumerate(shape)) {
+                        int64_t distri_unit = layout[i]*subShape[i];
+                        if ((dim % distri_unit) == 0) {
+                          // Evenly divisible case, divide the dimension by the layout factor.
+                          distributedShape[i] = dim / layout[i];
+                          assert((distributedShape[i] % subShape[i] == 0) &&
+                                "Even distribution: sgdata or lanedata must divide the distributed dimension");
+                          llvm::errs() << "  dim[" << i << "]=" << dim
+                            << " evenly divisible, distributed=" << distributedShape[i] << "\n";
+                        } else {
+                          // wrap around case, the dimension size must be equal to subShape value
+                          assert(dim == subShape[i] &&
+                                "Wrap-around distribution: sgdata or lanedata must be same as tensor tile shape");
+                          distributedShape[i] = dim;
+                          llvm::errs() << "  dim[" << i << "]=" << dim
+                            << " wrap-around, kept=" << distributedShape[i] << "\n";
+                        }
+                      }
+                      llvm::errs() << "  result: [";
+                        llvm::interleaveComma(distributedShape, llvm::errs());
+                        llvm::errs() << "]\n";
+                      return distributedShape;
+                    }]>,
     InterfaceMethod</*desc=*/[{Check if this layout is a slice of another layout.}],
                     /*retTy=*/"bool",
                     /*methodName=*/"isSliceOf",
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index f9aa01aca7172..4402e0674208e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -160,7 +160,7 @@ bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
   // check LaneLayout and LaneData
   auto maybeLaneShape =
       tryDistribute(instShape, attr.getEffectiveLaneLayoutAsInt(),
-                    attr.getEffectiveLaneDataAsInt(), false);
+                    attr.getEffectiveLaneDataAsInt(), true);
   return maybeLaneShape.has_value();
 }
 
@@ -205,6 +205,83 @@ LogicalResult ScatterTensorDescAttr::verify(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPU_DistributeLayoutAttr
+//===----------------------------------------------------------------------===//
+
+/// Computes the per-compute-unit shape by dividing each dimension of
+/// `shape` by the corresponding layout factor (sg_layout or
+/// lane_layout). For wrap-around dimensions where the division is uneven,
+/// the tensor tile is broadcasted to all subgroups/lanes.
+// FailureOr<SmallVector<int64_t>>
+// DistributeLayoutAttr::computeDistributedShape(SmallVector<int64_t> shape)
+// const {
+
+//   SmallVector<int64_t> layout;
+//   SmallVector<int64_t> subShape;
+//   if (isForWorkgroup()) {
+//     layout = getEffectiveSgLayoutAsInt();
+//     subShape = getEffectiveSgDataAsInt();
+//   } else if (isForSubgroup()) {
+//     layout = getEffectiveLaneLayoutAsInt();
+//     subShape = getEffectiveLaneDataAsInt();
+//   } else {
+//     return failure();
+//   }
+//   assert(
+//       !subShape.empty() &&
+//       "sgdata or lanedata cannot be empty for distributed shape
+//       computation");
+
+//   SmallVector<int64_t> distributedShape(shape);
+//   for (auto [i, dim] : llvm::enumerate(shape)) {
+//     if (dim % layout[i] != 0) {
+//       // wrap around case, the dimension size must be equal to subShape value
+//       assert(dim == subShape[i] &&
+//              "Wrap-around distribution: sgdata or lanedata must be same as "
+//              "tensor tile shape");
+//       distributedShape[i] = dim;
+//     } else {
+//       // Evenly divisible case, divide the dimension by the layout factor.
+//       distributedShape[i] = dim / layout[i];
+//       assert(distributedShape[i] % subShape[i] == 0 &&
+//              "Even distribution: sgdata or lanedata must divide the "
+//              "distributed dimension");
+//     }
+//   }
+//   return distributedShape;
+// }
+
+// bool DistributeLayoutAttr::isCompatibleWith(
+//     const xegpu::DistributeLayoutAttr &other, xegpu::LayoutKind level) {
+//   if (!other)
+//     return false;
+//   switch (level) {
+//   case xegpu::LayoutKind::Subgroup:
+//     if (getEffectiveSgLayoutAsInt() ==
+//                other.getEffectiveSgLayoutAsInt() &&
+//            getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt() &&
+//           getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
+//       return true;
+//     }
+//   case xegpu::LayoutKind::InstData:
+//     if (getEffectiveInstDataAsInt() ==
+//            other.getEffectiveInstDataAsInt()) {
+//       return true;
+//     }
+//   case xegpu::LayoutKind::Lane:
+//     if (getEffectiveLaneLayoutAsInt() ==
+//                other.getEffectiveLaneLayoutAsInt() &&
+//            getEffectiveLaneDataAsInt() ==
+//                other.getEffectiveLaneDataAsInt() &&
+//                getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
+//       return true;
+//     }
+//   }
+
+//   return false;
+// }
+
 //===----------------------------------------------------------------------===//
 // XeGPU_LayoutAttr
 //===----------------------------------------------------------------------===//
@@ -373,12 +450,8 @@ LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
   } else {
     return failure();
   }
-  if (subShape.empty()) {
-    if (auto derivedShape = computeShapeRatio(shape, layout))
-      subShape = derivedShape.value();
-    else
-      return failure();
-  }
+  assert(!subShape.empty() && "sgdata or lanedata cannot be empty for "
+                              "distributed coordinates computation");
 
   // delinearize Ids
   auto maybeIds = delinearizeId(builder, loc, linearId);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 9f8dbc15f6422..c9998093171c6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -52,21 +52,25 @@ getSgShapeAndCount(ArrayRef<int64_t> shape,
                    xegpu::DistributeLayoutAttr layout) {
   int count = 1;
   SmallVector<int64_t> sgShape(shape);
-  if (layout && layout.isForWorkgroup()) {
-    SmallVector<int64_t> sgLayout = layout.getEffectiveSgLayoutAsInt();
-    if (!layout.getEffectiveSgDataAsInt().empty())
-      sgShape = layout.getEffectiveSgDataAsInt();
-    else if (auto maybeDerivedSgData = computeShapeRatio(shape, sgLayout))
-      sgShape = *maybeDerivedSgData;
-    SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, sgShape);
-    // Clamp distUnit to the original shape to handle cases where data is
-    // shared among subgroups, which may cause distUnit to exceed the original
-    // shape.
-    for (size_t i = 0; i < distUnit.size(); ++i)
-      distUnit[i] = std::min(shape[i], distUnit[i]);
-    count = computeProduct(shape) / computeProduct(distUnit);
-  }
-  return std::make_pair(sgShape, count);
+  auto distributedShape = layout.computeDistributedShape(
+      SmallVector<int64_t>(shape.begin(), shape.end()));
+  if (failed(distributedShape))
+    return std::make_pair(sgShape, count);
+  auto sgData = layout.getEffectiveSgDataAsInt();
+  count = computeProduct(distributedShape.value()) / computeProduct(sgData);
+  // auto sgLayout = layout.getEffectiveSgLayoutAsInt();
+  // auto sgData = layout.getEffectiveSgDataAsInt();
+  // SmallVector<int64_t> distUnit =
+  //     computeElementwiseMul(sgLayout, sgData);
+  // // Clamp distUnit to the original shape to handle wrap-around cases where
+  // data
+  // // is shared among subgroups, which may cause distUnit to exceed the
+  // original
+  // // shape.
+  // for (size_t i = 0; i < distUnit.size(); ++i)
+  //   distUnit[i] = std::min(shape[i], distUnit[i]);
+  // count = computeProduct(shape) / computeProduct(distUnit);
+  return std::make_pair(sgData, count);
 }
 
 /// Utility helper for deriving a list of offsets for each sub-TensorDescs
@@ -1712,16 +1716,20 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
   converter.addConversion(
       [&](xegpu::TensorDescType type,
           SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        xegpu::LayoutAttr layout = type.getLayoutAttr();
+        // Only convert WG-level tensor descs. SG-level or layout-less types
+        // are already legal and should pass through unchanged.
+        if (!layout || !layout.isForWorkgroup())
+          return std::nullopt;
+
         Type elemTy = type.getElementType();
         ArrayRef<int64_t> shape = type.getShape();
 
         int count;
         SmallVector<int64_t> subShape;
-        xegpu::LayoutAttr layout = type.getLayoutAttr();
         std::tie(subShape, count) = getSgShapeAndCount(shape, layout);
 
-        if (layout)
-          layout = layout.dropSgLayoutAndData();
+        layout = layout.dropSgLayoutAndData();
 
         auto newTy = xegpu::TensorDescType::get(
             type.getContext(), subShape, elemTy, type.getEncoding(), layout);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a57bf8512ddec..9884a46a7d5b8 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -103,33 +103,45 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
   return xegpu::getDistributedVectorType(helperTdescTy);
 }
 
+// FailureOr<VectorType>
+// xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
+//                                        VectorType originalType) {
+//   if (!layout)
+//     return failure();
+//   assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
+//          "Expecting a valid layout.");
+//   SmallVector<int64_t> effectiveLaneLayout =
+//       layout.getEffectiveLaneLayoutAsInt();
+//   assert(static_cast<size_t>(originalType.getRank()) >=
+//              effectiveLaneLayout.size() &&
+//          "Rank of the original vector type should be greater or equal to the
+//          " "size of the lane layout to distribute the vector type.");
+//   SmallVector<int64_t> distributedShape(originalType.getShape());
+//   // Only distribute the last `laneLayout.size()` dimensions. The remaining
+//   // dimensions are not distributed.
+//   unsigned distributionStart =
+//       originalType.getRank() - effectiveLaneLayout.size();
+//   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+//     if (i < distributionStart)
+//       continue;
+//     // Check if the dimension can be distributed evenly.
+//     if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+//       return failure();
+//     distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+//   }
+//   return VectorType::get(distributedShape, originalType.getElementType());
+// }
+
 FailureOr<VectorType>
 xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                        VectorType originalType) {
   if (!layout)
     return failure();
-  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
-         "Expecting a valid layout.");
-  SmallVector<int64_t> effectiveLaneLayout =
-      layout.getEffectiveLaneLayoutAsInt();
-  assert(static_cast<size_t>(originalType.getRank()) >=
-             effectiveLaneLayout.size() &&
-         "Rank of the original vector type should be greater or equal to the "
-         "size of the lane layout to distribute the vector type.");
-  SmallVector<int64_t> distributedShape(originalType.getShape());
-  // Only distribute the last `laneLayout.size()` dimensions. The remaining
-  // dimensions are not distributed.
-  unsigned distributionStart =
-      originalType.getRank() - effectiveLaneLayout.size();
-  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart)
-      continue;
-    // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-      return failure();
-    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
-  }
-  return VectorType::get(distributedShape, originalType.getElementType());
+  auto distributedShape = layout.computeDistributedShape(
+      SmallVector<int64_t>(originalType.getShape()));
+  if (failed(distributedShape))
+    return failure();
+  return VectorType::get(*distributedShape, originalType.getElementType());
 }
 
 std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 53d497e4c2087..74c505e6dc6be 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -657,14 +657,6 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
   return
 }
 
-// -----
-func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
-  %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-      // expected-error at +1 {{cannot distribute [4, 8] using #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>}}
-      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
-  return
-}
-
 // -----
 func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
   %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index e4b4e22e5cf97..29670d0b5aadd 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -27,6 +27,16 @@ gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
   gpu.return
 }
 
+
+// -----
+// CHECK: func.func @create_nd_tdesc_wrap_around_layout(%[[arg0:.*]]: memref<24x32xf32>) {
+func.func @create_nd_tdesc_wrap_around_layout(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
+    %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+      !xegpu.tensor_desc<4x8xf32,  #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
+  return
+}
+
 // CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
 gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 9e0ae881c8a7e..ecc5fe3dd75e0 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -104,28 +104,28 @@ gpu.module @test_distribution {
 
   // CHECK-LABEL: dpas_no_sg_data
   gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
-    // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+    // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
     %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>>
-    %load_a =  xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+    %load_a =  xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>}
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>>
       -> vector<128x128xf16>
     %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
       order = [1, 0]>>
-    %load_b =  xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
-      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+    %load_b =  xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
+      : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
       order = [1, 0]>>
       -> vector<128x128xf16>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+      {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>,
-       layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+       layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1],
       order = [1, 0]>,
-       layout_cd =  #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+       layout_cd =  #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16],  lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
       : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 61ca028dd3ea1..df3fa880c9d6d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -107,22 +107,22 @@ gpu.module @test_1_1_assignment {
   // CHECK-LABEL: dpas_no_sg_data
   gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
     %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>>
-    %load_a =  xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+    %load_a =  xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
       order = [1, 0]>>
       -> vector<128x128xf16>
     %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
-      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+      -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
       order = [1, 0]>>
-    %load_b =  xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+    %load_b =  xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
       order = [1, 0]>>
       -> vector<128x128xf16>
-    // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+    // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>,
-       layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>,
-       layout_cd =  #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+      {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>,
+       layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>,
+       layout_cd =  #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
       : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
     gpu.return
   }

>From b072030f9eb76a23aa4cffd293e0b505b50c5798 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sun, 15 Mar 2026 05:54:44 +0000
Subject: [PATCH 2/7] saving work

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  56 ++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 342 ++++++++++++------
 .../Transforms/XeGPUSubgroupDistribute.cpp    |   4 +-
 .../Transforms/XeGPUWgToSgDistribute.cpp      |   2 +-
 4 files changed, 279 insertions(+), 125 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7eba66d4485e2..04b058a91060f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -270,6 +270,15 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                     "FailureOr<SmallVector<SmallVector<Value>>>",
                     "computeDistributedCoords",
                     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>,
+    InterfaceMethod<[{Statically computes multidimensional coordinates for all dist units
+                      assigned to a compute unit identified by `linearId`. This is the
+                      compile-time counterpart of `computeDistributedCoords`: it performs
+                      the same delinearization and round-robin enumeration but operates
+                      entirely on static integer values. Returns a list of coordinate
+                      vectors, one per dist unit.}],
+                    /*retTy=*/"SmallVector<SmallVector<int64_t>>",
+                    /*methodName=*/"computeStaticDistributedCoords",
+                    /*args=*/(ins "int64_t":$linearId, "ArrayRef<int64_t>":$shape)>,
     InterfaceMethod<[{Computes the per-compute-unit shape by dividing each dimension of
                       `shape` by the corresponding layout factor (sg_layout or
                       lane_layout). For wrap-around dimensions where the division is uneven,
@@ -339,28 +348,12 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                                   "ArrayRef<int64_t>": $perm,
                                   "xegpu::LayoutKind": $kind)>,
     InterfaceMethod</*desc=*/[{Check if this layout is compatible with another layout
-                     at a specific level of the layout hierarchy. Unlike isEqualTo,
-                     this compares only the effective (non-sliced) fields at the
-                     requested level.}],
+                     at a specific level of the layout hierarchy regarding a given shape. }],
                     /*retTy=*/"bool",
                     /*methodName=*/"isCompatibleWith",
                     /*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other,
-                                  "xegpu::LayoutKind": $level),
-                    /*methodBody=*/[{
-                      if (!other)
-                        return false;
-                      switch (level) {
-                        case xegpu::LayoutKind::Subgroup:
-                          return $_self.getEffectiveSgLayoutAsInt() == other.getEffectiveSgLayoutAsInt() &&
-                                 $_self.getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt();
-                        case xegpu::LayoutKind::InstData:
-                          return $_self.getEffectiveInstDataAsInt() == other.getEffectiveInstDataAsInt();
-                        case xegpu::LayoutKind::Lane:
-                          return $_self.getEffectiveLaneLayoutAsInt() == other.getEffectiveLaneLayoutAsInt() &&
-                                 $_self.getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt();
-                      }
-                      return false;
-                    }]>,
+                                  "SmallVector<int64_t>: %shape",
+                                  "xegpu::LayoutKind": $level)>,
     InterfaceMethod</*desc=*/[{Check if this layout is equal to another layout.
                      For LayoutAttr, this compares all fields.
                      For SliceAttr, this requires the same parent and same sliced dims.}],
@@ -616,12 +609,24 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
     FailureOr<SmallVector<SmallVector<Value>>>
     computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
+    /// Statically computes multidimensional coordinates for all dist units
+    /// assigned to a compute unit identified by `linearId`. This is the
+    /// compile-time counterpart of `computeDistributedCoords`.
+    SmallVector<SmallVector<int64_t>>
+    computeStaticDistributedCoords(int64_t linearId, ArrayRef<int64_t> shape);
+
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
 
     /// Check if this layout is equal to another layout.
     bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
 
+    /// Check if this layout is compatible with another layout 
+    /// at a specific level of the layout hierarchy regarding a given shape.
+    bool isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
+                                  SmallVector<int64_t> shape,
+                                  xegpu::LayoutKind level);
+
     /// Check if this layout is a transpose of another layout.
     bool isTransposeOf(const xegpu::DistributeLayoutAttr &other, ArrayRef<int64_t> perm, const xegpu::LayoutKind kind);
   }];
@@ -829,16 +834,27 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
     /// assigned to a subgroup identified by linearId. The shape parameter
     /// represents the workgroup-level problem size. Each subgroup may access
     /// multiple blocks according to round-robin distribution rules.
-
     FailureOr<SmallVector<SmallVector<Value>>>
     computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
+    ///Statically computes multidimensional coordinates for all dist units
+    ///assigned to a compute unit identified by `linearId`. This is the
+    ///compile-time counterpart of `computeDistributedCoords`.
+    SmallVector<SmallVector<int64_t>>
+    computeStaticDistributedCoords(int64_t linearId, ArrayRef<int64_t> shape);
+
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
 
     /// Check if this layout is equal to another layout.
     bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
 
+    /// Check if this layout is compatible with another layout 
+    /// at a specific level of the layout hierarchy regarding a given shape.
+    bool isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
+                                  SmallVector<int64_t> shape,
+                                  xegpu::LayoutKind level);
+
     /// Check if this layout is a transpose of another layout.
     bool isTransposeOf(const xegpu::DistributeLayoutAttr &other, ArrayRef<int64_t> perm, const xegpu::LayoutKind kind);
 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4402e0674208e..bd221c03757f8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -205,83 +205,6 @@ LogicalResult ScatterTensorDescAttr::verify(
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// XeGPU_DistributeLayoutAttr
-//===----------------------------------------------------------------------===//
-
-/// Computes the per-compute-unit shape by dividing each dimension of
-/// `shape` by the corresponding layout factor (sg_layout or
-/// lane_layout). For wrap-around dimensions where the division is uneven,
-/// the tensor tile is broadcasted to all subgroups/lanes.
-// FailureOr<SmallVector<int64_t>>
-// DistributeLayoutAttr::computeDistributedShape(SmallVector<int64_t> shape)
-// const {
-
-//   SmallVector<int64_t> layout;
-//   SmallVector<int64_t> subShape;
-//   if (isForWorkgroup()) {
-//     layout = getEffectiveSgLayoutAsInt();
-//     subShape = getEffectiveSgDataAsInt();
-//   } else if (isForSubgroup()) {
-//     layout = getEffectiveLaneLayoutAsInt();
-//     subShape = getEffectiveLaneDataAsInt();
-//   } else {
-//     return failure();
-//   }
-//   assert(
-//       !subShape.empty() &&
-//       "sgdata or lanedata cannot be empty for distributed shape
-//       computation");
-
-//   SmallVector<int64_t> distributedShape(shape);
-//   for (auto [i, dim] : llvm::enumerate(shape)) {
-//     if (dim % layout[i] != 0) {
-//       // wrap around case, the dimension size must be equal to subShape value
-//       assert(dim == subShape[i] &&
-//              "Wrap-around distribution: sgdata or lanedata must be same as "
-//              "tensor tile shape");
-//       distributedShape[i] = dim;
-//     } else {
-//       // Evenly divisible case, divide the dimension by the layout factor.
-//       distributedShape[i] = dim / layout[i];
-//       assert(distributedShape[i] % subShape[i] == 0 &&
-//              "Even distribution: sgdata or lanedata must divide the "
-//              "distributed dimension");
-//     }
-//   }
-//   return distributedShape;
-// }
-
-// bool DistributeLayoutAttr::isCompatibleWith(
-//     const xegpu::DistributeLayoutAttr &other, xegpu::LayoutKind level) {
-//   if (!other)
-//     return false;
-//   switch (level) {
-//   case xegpu::LayoutKind::Subgroup:
-//     if (getEffectiveSgLayoutAsInt() ==
-//                other.getEffectiveSgLayoutAsInt() &&
-//            getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt() &&
-//           getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
-//       return true;
-//     }
-//   case xegpu::LayoutKind::InstData:
-//     if (getEffectiveInstDataAsInt() ==
-//            other.getEffectiveInstDataAsInt()) {
-//       return true;
-//     }
-//   case xegpu::LayoutKind::Lane:
-//     if (getEffectiveLaneLayoutAsInt() ==
-//                other.getEffectiveLaneLayoutAsInt() &&
-//            getEffectiveLaneDataAsInt() ==
-//                other.getEffectiveLaneDataAsInt() &&
-//                getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
-//       return true;
-//     }
-//   }
-
-//   return false;
-// }
-
 //===----------------------------------------------------------------------===//
 // XeGPU_LayoutAttr
 //===----------------------------------------------------------------------===//
@@ -315,25 +238,17 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                        << lane_layout.size();
   }
 
-  // sg_data is optional for Workgroup layout, but its presence requires
-  // sg_layout.
-  if (sg_data) {
-    if (!sg_layout)
-      return emitError() << "expected sg_layout being used with sg_data";
-    if (sg_data.size() != sg_layout.size())
-      return emitError()
-             << "expected sg_data and sg_layout to have the same rank";
-  }
+  if ((sg_data && !sg_layout) || (!sg_data && sg_layout))
+    return emitError() << "sg_layout and sg_data must be used together";
+  if (sg_data.size() != sg_layout.size())
+    return emitError()
+           << "expected sg_data and sg_layout to have the same rank";
 
-  // lane_data is optional for Subgroup layout, but its presence requires
-  // lane_layout.
-  if (lane_data) {
-    if (!lane_layout)
-      return emitError() << "expected lane_layout being used with lane_data";
-    if (lane_data.size() != lane_layout.size())
-      return emitError()
-             << "expected lane_data and lane_layout to have the same rank";
-  }
+  if ((lane_data && !lane_layout) || (!lane_data && lane_layout))
+    return emitError() << "lane_layout and lane_data must be used together";
+  if (lane_data.size() != lane_layout.size())
+    return emitError()
+           << "expected lane_data and lane_layout to have the same rank";
 
   if (order) {
     if (!sg_layout && !lane_layout)
@@ -469,6 +384,70 @@ bool LayoutAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
   return *this == dyn_cast<xegpu::LayoutAttr>(other);
 }
 
+/// Implements DistributeLayoutAttr::computeStaticDistributedCoords to
+/// compute multi-dimensional offsets for a given linear ID when distributed by
+/// LayoutAttr.
+SmallVector<SmallVector<int64_t>>
+LayoutAttr::computeStaticDistributedCoords(int64_t linearId,
+                                           ArrayRef<int64_t> shape) {
+  SmallVector<int64_t> layoutVec;
+  SmallVector<int64_t> subShape;
+  SmallVector<int64_t> instData;
+  if (isForWorkgroup()) {
+    layoutVec = getEffectiveSgLayoutAsInt();
+    subShape = getEffectiveSgDataAsInt();
+  } else if (isForSubgroup()) {
+    instData = getEffectiveInstDataAsInt();
+    layoutVec = getEffectiveLaneLayoutAsInt();
+    subShape = getEffectiveLaneDataAsInt();
+  }
+  if (!instData.empty()) {
+    linearId = 0;
+    subShape = instData;
+  }
+  assert(!subShape.empty() && "sgdata or lanedata cannot be empty");
+
+  // Delinearize the linear ID using the order attribute.
+  DenseI32ArrayAttr orderAttr = getOrder();
+  SmallVector<int64_t> order;
+  if (orderAttr && !orderAttr.empty()) {
+    order = llvm::map_to_vector(orderAttr.asArrayRef(), [](int32_t idx) {
+      return static_cast<int64_t>(idx);
+    });
+  } else {
+    order =
+        llvm::to_vector(llvm::reverse(llvm::seq<int64_t>(0, layoutVec.size())));
+  }
+  SmallVector<int64_t> delinearizedId(layoutVec.size());
+  int64_t remaining = linearId;
+  for (size_t i = 0; i < order.size(); ++i) {
+    int64_t dimIdx = order[i];
+    delinearizedId[dimIdx] = remaining % layoutVec[dimIdx];
+    remaining = remaining / layoutVec[dimIdx];
+  }
+
+  // Compute distribution unit shape (clamped to srcShape).
+  SmallVector<int64_t> distUnitShape(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    distUnitShape[i] = std::min(shape[i], layoutVec[i] * subShape[i]);
+
+  // Compute local offset of this ID within a distribution unit.
+  SmallVector<int64_t> localOffset(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    localOffset[i] = delinearizedId[i] * subShape[i];
+
+  // Enumerate all distribution units and compute coordinates.
+  SmallVector<SmallVector<int64_t>> coordinates;
+  for (SmallVector<int64_t> unitOffs :
+       StaticTileOffsetRange(shape, distUnitShape)) {
+    SmallVector<int64_t> coord(shape.size());
+    for (size_t i = 0; i < shape.size(); ++i)
+      coord[i] = (unitOffs[i] + localOffset[i]) % shape[i];
+    coordinates.push_back(coord);
+  }
+  return coordinates;
+}
+
 // set the layout for unit dims: sg_data, inst_data and lane_data to 1
 DistributeLayoutAttr
 LayoutAttr::setUnitDimData(SmallVector<int64_t> unitDims) const {
@@ -816,6 +795,44 @@ bool LayoutAttr::isTransposeOf(const xegpu::DistributeLayoutAttr &other,
   return false;
 }
 
+bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
+                                  SmallVector<int64_t> shape,
+                                  xegpu::LayoutKind level) {
+  if (!other)
+    return false;
+  if (getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
+    if (level == xegpu::LayoutKind::Subgroup)
+      return (getEffectiveSgLayoutAsInt() ==
+                  other.getEffectiveSgLayoutAsInt() &&
+              getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt());
+    if (level == xegpu::LayoutKind::Lane)
+      return (getEffectiveLaneLayoutAsInt() == other.getEffectiveLaneLayoutAsInt() &&
+        getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt();
+  }
+  if (level == xegpu::LayoutKind::Subgroup) {
+    int64_t wgSize = computeProduct(getEffectiveSgLayoutAsInt());
+    for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
+      auto coords = computeStaticDistributedCoords(id, shape);
+      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
+      if (coords != otherCoords)
+        return false;
+    }
+  }
+  if (level == xegpu::LayoutKind::InstData) {
+    return (getEffectiveInstDataAsInt() == other.getEffectiveInstDataAsInt());
+  }
+  if (level == xegpu::LayoutKind::Lane) {
+    int64_t subgroupSize = computeProduct(getEffectiveLaneLayoutAsInt());
+    for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
+      auto coords = computeStaticDistributedCoords(id, shape);
+      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
+      if (coords != otherCoords)
+        return false;
+    }
+  }
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -892,12 +909,8 @@ SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
     return failure();
   }
 
-  if (subShape.empty()) {
-    if (auto derivedShape = computeShapeRatio(shape, layout))
-      subShape = derivedShape.value();
-    else
-      return failure();
-  }
+  if (subShape.empty())
+    return failure();
 
   // delinearize Ids
   auto maybeIds = delinearizeId(builder, loc, linearId);
@@ -907,10 +920,92 @@ SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
   // The effective sgIds for offsets computing correspond
   // to the dims that are not sliced.
   ArrayRef<int64_t> dims = flatten().getDims().asArrayRef();
-  SmallVector<Value> sgIds =
+  SmallVector<Value> canonicalIds =
       XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims);
 
-  return genCoordinates(builder, loc, sgIds, layout, subShape, shape);
+  return genCoordinates(builder, loc, canonicalIds, layout, subShape, shape);
+}
+
+/// Implements DistributeLayoutAttr::computeStaticDistributedCoords to
+/// compute multi-dimensional offsets for a given linear ID when distributed by
+/// SliceAttr. Delegates delinearization to the parent LayoutAttr, then uses
+/// only the non-sliced dimensions for coordinate computation.
+SmallVector<SmallVector<int64_t>>
+SliceAttr::computeStaticDistributedCoords(int64_t linearId,
+                                          ArrayRef<int64_t> shape) {
+  assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
+
+  SmallVector<int64_t> layout;
+  SmallVector<int64_t> subShape;
+  if (isForWorkgroup()) {
+    layout = getEffectiveSgLayoutAsInt();
+    subShape = getEffectiveSgDataAsInt();
+  } else if (isForSubgroup()) {
+    instData = getEffectiveInstDataAsInt();
+    layoutVec = getEffectiveLaneLayoutAsInt();
+    subShape = getEffectiveLaneDataAsInt();
+  }
+  if (!instData.empty()) {
+    linearId = 0;
+    subShape = instData;
+  }
+
+  assert(!subShape.empty() && "sgdata or lanedata cannot be empty");
+
+  // Delinearize the ID using the parent layout (same as the IR version).
+  SliceAttr flattened = flatten();
+  auto parent = dyn_cast<LayoutAttr>(flattened.getParent());
+  SmallVector<int64_t> parentLayoutVec;
+  if (parent.isForWorkgroup())
+    parentLayoutVec = parent.getEffectiveSgLayoutAsInt();
+  else
+    parentLayoutVec = parent.getEffectiveLaneLayoutAsInt();
+
+  DenseI32ArrayAttr orderAttr = parent.getOrder();
+  SmallVector<int64_t> order;
+  if (orderAttr && !orderAttr.empty()) {
+    order = llvm::map_to_vector(orderAttr.asArrayRef(), [](int32_t idx) {
+      return static_cast<int64_t>(idx);
+    });
+  } else {
+    order = llvm::to_vector(
+        llvm::reverse(llvm::seq<int64_t>(0, parentLayoutVec.size())));
+  }
+  SmallVector<int64_t> allIds(parentLayoutVec.size());
+  int64_t remaining = linearId;
+  for (size_t i = 0; i < order.size(); ++i) {
+    int64_t dimIdx = order[i];
+    allIds[dimIdx] = remaining % parentLayoutVec[dimIdx];
+    if (i < order.size() - 1)
+      remaining = remaining / parentLayoutVec[dimIdx];
+  }
+
+  // The effective IDs for coordinate computation correspond
+  // to the dims that are not sliced.
+  ArrayRef<int64_t> dims = flattened.getDims().asArrayRef();
+  SmallVector<int64_t> canonicalIds =
+      XeGPUDialect::slice(ArrayRef<int64_t>(allIds), dims);
+
+  // Compute distribution unit shape (clamped to srcShape).
+  SmallVector<int64_t> distUnitShape(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    distUnitShape[i] = std::min(shape[i], layout[i] * subShape[i]);
+
+  // Compute local offset of this ID within a distribution unit.
+  SmallVector<int64_t> localOffset(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    localOffset[i] = canonicalIds[i] * subShape[i];
+
+  // Enumerate all distribution units and compute coordinates.
+  SmallVector<SmallVector<int64_t>> coordinates;
+  for (SmallVector<int64_t> unitOffs :
+       StaticTileOffsetRange(shape, distUnitShape)) {
+    SmallVector<int64_t> coord(shape.size());
+    for (size_t i = 0; i < shape.size(); ++i)
+      coord[i] = (unitOffs[i] + localOffset[i]) % shape[i];
+    coordinates.push_back(coord);
+  }
+  return coordinates;
 }
 
 bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
@@ -944,6 +1039,47 @@ bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
           (flattenedThis.getDims() == flattenedOther.getDims()));
 }
 
+bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
+                                  SmallVector<int64_t> shape,
+                                  xegpu::LayoutKind level) {
+  if (!other)
+    return false;
+  if (getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
+    if (level == xegpu::LayoutKind::Subgroup)
+      return (getEffectiveSgLayoutAsInt() ==
+                  other.getEffectiveSgLayoutAsInt() &&
+              getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt());
+    if (level == xegpu::LayoutKind::Lane)
+      return (getEffectiveLaneLayoutAsInt() == other.getEffectiveLaneLayoutAsInt() &&
+        getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt();
+  }
+
+  auto flattenedThis = flatten();
+  auto parent = dyn_cast<LayoutAttr>(flattenedThis.getParent());
+  if (level == xegpu::LayoutKind::Subgroup) {
+    int64_t wgSize = computeProduct(parent.getEffectiveSgLayoutAsInt());
+    for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
+      auto coords = computeStaticDistributedCoords(id, shape);
+      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
+      if (coords != otherCoords)
+        return false;
+    }
+  }
+  if (level == xegpu::LayoutKind::InstData) {
+    return (getEffectiveInstDataAsInt() == other.getEffectiveInstDataAsInt());
+  }
+  if (level == xegpu::LayoutKind::Lane) {
+    int64_t subgroupSize = computeProduct(parent.getEffectiveLaneLayoutAsInt());
+    for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
+      auto coords = computeStaticDistributedCoords(id, shape);
+      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
+      if (coords != otherCoords)
+        return false;
+    }
+  }
+  return true;
+}
+
 xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> sliceDimsToDrop) {
   if (sliceDimsToDrop.empty())
     return *this;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 7e8ad733fa0ee..e56e443feae0f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -2080,11 +2080,13 @@ struct ConvertLayoutDistribution
                                 PatternRewriter &rewriter) const override {
     auto inputLayout = op.getInputLayoutAttr();
     auto targetLayout = op.getTargetLayoutAttr();
+    auto resShape = cast<VectorType>(op.getResult().getType()).getShape();
 
     if (!inputLayout || !targetLayout)
       return rewriter.notifyMatchFailure(op, "missing layout attributes");
 
-    if (!inputLayout.isCompatibleWith(targetLayout, xegpu::LayoutKind::Lane)) {
+    if (!inputLayout.isCompatibleWith(targetLayout, resShape,
+                                      xegpu::LayoutKind::Lane)) {
       return rewriter.notifyMatchFailure(
           op, "lowering incompatible convert_layout not yet supported");
     }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index c9998093171c6..9de5861d81fac 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -630,7 +630,7 @@ struct WgToSgConvertLayoutOp
     SmallVector<int64_t> targetSgData = targetLayout.getEffectiveSgDataAsInt();
 
     // Fast path: if sg_layout and sg_data are identical, no SLM needed
-    if (inputLayout.isCompatibleWith(targetLayout,
+    if (inputLayout.isCompatibleWith(targetLayout, wgShape,
                                      xegpu::LayoutKind::Subgroup)) {
       inputLayout = inputLayout.dropSgLayoutAndData();
       targetLayout = targetLayout.dropSgLayoutAndData();

>From edc26726254a8b02f471943bc32dec8825a567be Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 16 Mar 2026 01:17:06 +0000
Subject: [PATCH 3/7] enhance isCompatibleWith and computeStaticCoord
 utilities

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  2 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 27 ++++++++++---------
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  3 ++-
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  3 ++-
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 04b058a91060f..1c67244658d15 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -352,7 +352,7 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                     /*retTy=*/"bool",
                     /*methodName=*/"isCompatibleWith",
                     /*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other,
-                                  "SmallVector<int64_t>: %shape",
+                                  "SmallVector<int64_t>": $shape,
                                   "xegpu::LayoutKind": $level)>,
     InterfaceMethod</*desc=*/[{Check if this layout is equal to another layout.
                      For LayoutAttr, this compares all fields.
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index bd221c03757f8..b0d34afe62e6a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -238,15 +238,15 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                        << lane_layout.size();
   }
 
-  if ((sg_data && !sg_layout) || (!sg_data && sg_layout))
+  if ((sg_layout && !sg_data) || (!sg_layout && sg_data))
     return emitError() << "sg_layout and sg_data must be used together";
-  if (sg_data.size() != sg_layout.size())
+  if (sg_layout && sg_data && sg_layout.size() != sg_data.size())
     return emitError()
            << "expected sg_data and sg_layout to have the same rank";
 
-  if ((lane_data && !lane_layout) || (!lane_data && lane_layout))
+  if ((lane_layout && !lane_data) || (!lane_layout && lane_data))
     return emitError() << "lane_layout and lane_data must be used together";
-  if (lane_data.size() != lane_layout.size())
+  if (lane_layout && lane_data && lane_layout.size() != lane_data.size())
     return emitError()
            << "expected lane_data and lane_layout to have the same rank";
 
@@ -806,8 +806,9 @@ bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
                   other.getEffectiveSgLayoutAsInt() &&
               getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt());
     if (level == xegpu::LayoutKind::Lane)
-      return (getEffectiveLaneLayoutAsInt() == other.getEffectiveLaneLayoutAsInt() &&
-        getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt();
+      return (getEffectiveLaneLayoutAsInt() ==
+                  other.getEffectiveLaneLayoutAsInt() &&
+              getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt());
   }
   if (level == xegpu::LayoutKind::Subgroup) {
     int64_t wgSize = computeProduct(getEffectiveSgLayoutAsInt());
@@ -937,12 +938,13 @@ SliceAttr::computeStaticDistributedCoords(int64_t linearId,
 
   SmallVector<int64_t> layout;
   SmallVector<int64_t> subShape;
+  SmallVector<int64_t> instData;
   if (isForWorkgroup()) {
     layout = getEffectiveSgLayoutAsInt();
     subShape = getEffectiveSgDataAsInt();
   } else if (isForSubgroup()) {
     instData = getEffectiveInstDataAsInt();
-    layoutVec = getEffectiveLaneLayoutAsInt();
+    layout = getEffectiveLaneLayoutAsInt();
     subShape = getEffectiveLaneDataAsInt();
   }
   if (!instData.empty()) {
@@ -1039,9 +1041,9 @@ bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
           (flattenedThis.getDims() == flattenedOther.getDims()));
 }
 
-bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
-                                  SmallVector<int64_t> shape,
-                                  xegpu::LayoutKind level) {
+bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
+                                 SmallVector<int64_t> shape,
+                                 xegpu::LayoutKind level) {
   if (!other)
     return false;
   if (getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
@@ -1050,8 +1052,9 @@ bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
                   other.getEffectiveSgLayoutAsInt() &&
               getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt());
     if (level == xegpu::LayoutKind::Lane)
-      return (getEffectiveLaneLayoutAsInt() == other.getEffectiveLaneLayoutAsInt() &&
-        getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt();
+      return (getEffectiveLaneLayoutAsInt() ==
+                  other.getEffectiveLaneLayoutAsInt() &&
+              getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt());
   }
 
   auto flattenedThis = flatten();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index e56e443feae0f..6d6725617d306 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -2085,7 +2085,8 @@ struct ConvertLayoutDistribution
     if (!inputLayout || !targetLayout)
       return rewriter.notifyMatchFailure(op, "missing layout attributes");
 
-    if (!inputLayout.isCompatibleWith(targetLayout, resShape,
+    SmallVector<int64_t> resShapeVec(resShape.begin(), resShape.end());
+    if (!inputLayout.isCompatibleWith(targetLayout, resShapeVec,
                                       xegpu::LayoutKind::Lane)) {
       return rewriter.notifyMatchFailure(
           op, "lowering incompatible convert_layout not yet supported");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 9de5861d81fac..687617b5150cb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -630,7 +630,8 @@ struct WgToSgConvertLayoutOp
     SmallVector<int64_t> targetSgData = targetLayout.getEffectiveSgDataAsInt();
 
     // Fast path: if sg_layout and sg_data are identical, no SLM needed
-    if (inputLayout.isCompatibleWith(targetLayout, wgShape,
+    SmallVector<int64_t> wgShapeVec(wgShape.begin(), wgShape.end());
+    if (inputLayout.isCompatibleWith(targetLayout, wgShapeVec,
                                      xegpu::LayoutKind::Subgroup)) {
       inputLayout = inputLayout.dropSgLayoutAndData();
       targetLayout = targetLayout.dropSgLayoutAndData();

>From 71f3a9efdecf9b755cd0f1c260faf06a93b852a4 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 17 Mar 2026 05:21:35 +0000
Subject: [PATCH 4/7] adding test and polish

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 32 +++++++--
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 13 ----
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 69 ++++++++++++++++++-
 mlir/test/Dialect/XeGPU/invalid.mlir          |  8 +--
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |  9 ++-
 5 files changed, 103 insertions(+), 28 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index b0d34afe62e6a..ded262692ac75 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -1047,18 +1047,25 @@ bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
   if (!other)
     return false;
   if (getEffectiveOrderAsInt() == other.getEffectiveOrderAsInt()) {
+    // short cut when order is the same, no need to compute coords and compare
     if (level == xegpu::LayoutKind::Subgroup)
-      return (getEffectiveSgLayoutAsInt() ==
-                  other.getEffectiveSgLayoutAsInt() &&
-              getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt());
+      if (getEffectiveSgLayoutAsInt() == other.getEffectiveSgLayoutAsInt() &&
+          getEffectiveSgDataAsInt() == other.getEffectiveSgDataAsInt())
+        return true;
     if (level == xegpu::LayoutKind::Lane)
-      return (getEffectiveLaneLayoutAsInt() ==
-                  other.getEffectiveLaneLayoutAsInt() &&
-              getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt());
+      if (getEffectiveLaneLayoutAsInt() ==
+              other.getEffectiveLaneLayoutAsInt() &&
+          getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt())
+        return true;
   }
 
   auto flattenedThis = flatten();
   auto parent = dyn_cast<LayoutAttr>(flattenedThis.getParent());
+  // debug print parent
+  llvm::dbgs() << "Parent layout - sgLayout: ";
+  for (auto &l : parent.getEffectiveSgLayoutAsInt()) {
+    llvm::dbgs() << l << " ";
+  }
   if (level == xegpu::LayoutKind::Subgroup) {
     int64_t wgSize = computeProduct(parent.getEffectiveSgLayoutAsInt());
     for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
@@ -1076,6 +1083,19 @@ bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
     for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
       auto coords = computeStaticDistributedCoords(id, shape);
       auto otherCoords = other.computeStaticDistributedCoords(id, shape);
+      llvm::dbgs() << "Lane comparison - id: " << id << ", coords: ";
+      for (auto &c : coords) {
+        llvm::dbgs() << "[";
+        llvm::interleaveComma(c, llvm::dbgs());
+        llvm::dbgs() << "] ";
+      }
+      llvm::dbgs() << "otherCoords: ";
+      for (auto &c : otherCoords) {
+        llvm::dbgs() << "[";
+        llvm::interleaveComma(c, llvm::dbgs());
+        llvm::dbgs() << "] ";
+      }
+      llvm::dbgs() << "\n";
       if (coords != otherCoords)
         return false;
     }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 57db05f4a3b74..08773a0cd2791 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -677,19 +677,6 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
   auto srcShape = sourceTy.getShape();
   auto resShape = resultTy.getShape();
 
-  size_t dimDiff = resultTy.getRank() - sourceTy.getRank();
-  if (dimDiff == 0) {
-    bool hasUnitDim =
-        llvm::any_of(srcShape, [](int64_t dim) { return dim == 1; });
-    Operation *srcOp = broadcast.getSource().getDefiningOp();
-    if (!srcOp)
-      return;
-    bool produceByShapeCast = isa<vector::ShapeCastOp>(srcOp);
-    assert(
-        hasUnitDim && produceByShapeCast &&
-        "When broadcasting from unit-dim, the producer op must be shape_cast!");
-  }
-
   auto resultLayoutAttr =
       dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 6d6725617d306..f5cf7592b1dc5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1091,33 +1091,69 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
+    LLVM_DEBUG(DBGS() << "LoadDistribution: attempting to match warpOp: "
+                      << warpOp << "\n");
+
+    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
+    LLVM_DEBUG(DBGS() << "LoadDistribution: warpOp.getTerminator() = "
+                      << warpOp.getTerminator() << "\n");
+    LLVM_DEBUG(
+        DBGS() << "LoadDistribution: warpOp.getTerminator()->getPrevNode() = "
+               << lastNode << "\n");
+
+    auto loadGatherOp1 = dyn_cast_or_null<xegpu::LoadGatherOp>(lastNode);
+    // print loadGatherOp and its operands
+    if (loadGatherOp1) {
+      LLVM_DEBUG(DBGS() << "LoadDistribution: found LoadGatherOp: "
+                        << loadGatherOp1 << "\n");
+    } else {
+      LLVM_DEBUG(DBGS() << "LoadDistribution: no LoadGatherOp found as last op "
+                           "in warp region\n");
+    }
+
     OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
       // Check if the yield operand that was produced by the *last* scattered
       // load op to avoid sinking it before barriers (maintain memory order).
       return isa<xegpu::LoadGatherOp>(op) &&
              warpOp.getTerminator()->getPrevNode() == op;
     });
-    if (!producedByLastLoad)
+    if (!producedByLastLoad) {
+      LLVM_DEBUG(DBGS() << "LoadDistribution: no LoadGatherOp as last op in "
+                           "warp region, bailing out\n");
       return rewriter.notifyMatchFailure(
           warpOp, "The last op is not xegpu::LoadGatherOp");
+    }
 
     auto loadGatherOp =
         producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
+    LLVM_DEBUG(DBGS() << "LoadDistribution: found LoadGatherOp: "
+                      << loadGatherOp << "\n");
     auto offsets = loadGatherOp.getOffsets();
     if (!offsets || !isa<VectorType>(offsets.getType()) ||
-        !isa<VectorType>(loadGatherOp.getMask().getType()))
+        !isa<VectorType>(loadGatherOp.getMask().getType())) {
+      LLVM_DEBUG(DBGS() << "LoadDistribution: offsets or mask are not vector "
+                           "types, bailing out\n");
       return rewriter.notifyMatchFailure(
           loadGatherOp,
           "Load op must have a vector arguments for offsets and mask");
+    }
     VectorType offsetsTy = cast<VectorType>(offsets.getType());
     VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
     VectorType resultVecTy =
         cast<VectorType>(loadGatherOp.getResult().getType());
+    LLVM_DEBUG(DBGS() << "LoadDistribution: offsetsTy=" << offsetsTy
+                      << ", maskTy=" << maskTy
+                      << ", resultVecTy=" << resultVecTy << "\n");
     // add handling leading unit dimensions support
     int chunkSize = loadGatherOp.getChunkSize().value_or(1);
     int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+    LLVM_DEBUG(DBGS() << "LoadDistribution: chunkSize=" << chunkSize
+                      << ", effectiveVecRank=" << effectiveVecRank << "\n");
     for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
       if (resultVecTy.getShape()[i] != 1) {
+        LLVM_DEBUG(DBGS() << "LoadDistribution: non-unit leading dim at index "
+                          << i << " (size=" << resultVecTy.getShape()[i]
+                          << "), bailing out\n");
         return rewriter.notifyMatchFailure(
             loadGatherOp, "Only unit dimensions allowed for the leading "
                           "dimensions of the load vector!");
@@ -1127,6 +1163,8 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
     auto layoutOffsets =
         xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
     auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));
+    LLVM_DEBUG(DBGS() << "LoadDistribution: layoutOffsets=" << layoutOffsets
+                      << ", layoutMask=" << layoutMask << "\n");
 
     FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
@@ -1134,6 +1172,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
     if (failed(distOffsetsByWarpOpOrFailure) ||
         failed(distMaskByWarpOpOrFailure)) {
+      LLVM_DEBUG(
+          DBGS() << "LoadDistribution: failed to compute distributed types "
+                    "for offsets or mask, bailing out\n");
       return rewriter.notifyMatchFailure(
           loadGatherOp,
           "Some vector operands have no layouts, using defaults instead.");
@@ -1147,12 +1188,24 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         cast<VectorType>(warpOp.getResult(operandIdx).getType());
     VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
     VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
+    LLVM_DEBUG(DBGS() << "LoadDistribution: operandIdx=" << operandIdx
+                      << ", distResultTy=" << distResultTy
+                      << ", distOffsetsTy=" << distOffsetsTy
+                      << ", distMaskTy=" << distMaskTy << "\n");
 
     SmallVector<Type> operandTypesToYield = {operands[0].getType(),
                                              distOffsetsTy, distMaskTy};
 
+    LLVM_DEBUG(DBGS() << "LoadDistribution: creating new warp op with "
+                      << operands.size() << " operands and "
+                      << operandTypesToYield.size() << " types to yield\n");
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
+    LLVM_DEBUG({
+      DBGS() << "LoadDistribution: newRetIndices=[";
+      llvm::interleaveComma(newRetIndices, llvm::dbgs());
+      llvm::dbgs() << "]\n";
+    });
 
     rewriter.setInsertionPointAfter(newWarpOp);
 
@@ -1166,24 +1219,36 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
     VectorType distMaskTy1D =
         VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
                         distMaskByWarpOpOrFailure.value().getElementType());
+    LLVM_DEBUG(DBGS() << "LoadDistribution: 1D types: loadVecTy1D="
+                      << loadVecTy1D << ", distOffsetsTy1D=" << distOffsetsTy1D
+                      << ", distMaskTy1D=" << distMaskTy1D << "\n");
 
     Value distOffsetVal = resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
     Value distmaskVal = resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
+    LLVM_DEBUG(DBGS() << "LoadDistribution: resolved distOffsetVal="
+                      << distOffsetVal << ", distmaskVal=" << distmaskVal
+                      << "\n");
 
     SmallVector<Value> newLoadGatherOperands = {
         newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
 
+    LLVM_DEBUG(DBGS() << "LoadDistribution: creating distributed "
+                         "LoadGatherOp with result type "
+                      << loadVecTy1D << "\n");
     xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
         rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
         loadGatherOp->getAttrs());
     xegpu::removeLayoutAttrs(newOp);
+    LLVM_DEBUG(DBGS() << "LoadDistribution: created new op: " << newOp << "\n");
     Value distributedVal = newWarpOp.getResult(operandIdx);
     // Resolve the output type and replace all uses.
     rewriter.replaceAllUsesWith(
         distributedVal,
         resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
+    LLVM_DEBUG(DBGS() << "LoadDistribution: successfully distributed "
+                         "LoadGatherOp\n");
     return success();
   }
 };
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 74c505e6dc6be..7390b47b3f8d9 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -758,8 +758,8 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
   %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
       !xegpu.tensor_desc<16x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-        // expected-error at +1 {{expected sg_layout being used with sg_data}}
-        #xegpu.layout<sg_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2]>>
+        // expected-error at +1 {{sg_layout and sg_data must be used together}}
+        #xegpu.layout<sg_layout = [2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
   return
 }
 
@@ -768,8 +768,8 @@ func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
   %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
       !xegpu.tensor_desc<16x2xf32,
         #xegpu.scatter_tdesc_attr<chunk_size = 2>,
-        // expected-error at +1 {{expected lane_layout being used with lane_data}}
-        #xegpu.layout<inst_data = [16, 2], lane_data = [1, 2]>>
+        // expected-error at +1 {{lane_layout and lane_data must be used together}}
+        #xegpu.layout<inst_data = [16, 2], lane_layout = [16, 1]>>
   return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index ddd2d22108d1f..2dfade7369e53 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -739,7 +739,9 @@ gpu.module @test {
 // CHECK-SAME:       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<16x16xf16> to vector<16xf16>
 // CHECK-NEXT:    %[[SHAPECAST:.*]] = vector.shape_cast %[[REDUCE]]
 // CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
-// CHECK-NEXT:    vector.broadcast %[[SHAPECAST]]
+// CHECK-NEXT:    %[[EXP:.*]]  = math.exp %[[SHAPECAST]]
+// CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16>
+// CHECK-NEXT:    vector.broadcast %[[EXP]]
 // CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
 
 func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
@@ -748,8 +750,9 @@ func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x1
   %3 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
   %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
-  %6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16>
-  xegpu.store_nd %6, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  %6 = math.exp %5: vector<16x1xf16>
+  %7 = vector.broadcast %6 : vector<16x1xf16> to vector<16x16xf16>
+  xegpu.store_nd %7, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
 }

>From c9ac053d1a95cc17cefe0dc81143684c654bdabe Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 17 Mar 2026 05:41:25 +0000
Subject: [PATCH 5/7] remove debug print

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 18 -----
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 18 -----
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 66 +------------------
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 12 ----
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 29 --------
 5 files changed, 1 insertion(+), 142 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 1c67244658d15..7917a0dde556c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -301,18 +301,7 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                       assert(
                           !subShape.empty() &&
                           "sgdata or lanedata cannot be empty for distributed shape computation");
-
                       SmallVector<int64_t> distributedShape(shape.size());
-                      llvm::errs() << "computeDistributedShape:\n";
-                      llvm::errs() << "  shape: [";
-                        llvm::interleaveComma(shape, llvm::errs());
-                        llvm::errs() << "]\n";
-                      llvm::errs() << "  layout: [";
-                        llvm::interleaveComma(layout, llvm::errs());
-                        llvm::errs() << "]\n";
-                      llvm::errs() << "  subShape: [";
-                        llvm::interleaveComma(subShape, llvm::errs());
-                        llvm::errs() << "]\n";
                       for (auto [i, dim] : llvm::enumerate(shape)) {
                         int64_t distri_unit = layout[i]*subShape[i];
                         if ((dim % distri_unit) == 0) {
@@ -320,20 +309,13 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                           distributedShape[i] = dim / layout[i];
                           assert((distributedShape[i] % subShape[i] == 0) &&
                                 "Even distribution: sgdata or lanedata must divide the distributed dimension");
-                          llvm::errs() << "  dim[" << i << "]=" << dim
-                            << " evenly divisible, distributed=" << distributedShape[i] << "\n";
                         } else {
                           // wrap around case, the dimension size must be equal to subShape value
                           assert(dim == subShape[i] &&
                                 "Wrap-around distribution: sgdata or lanedata must be same as tensor tile shape");
                           distributedShape[i] = dim;
-                          llvm::errs() << "  dim[" << i << "]=" << dim
-                            << " wrap-around, kept=" << distributedShape[i] << "\n";
                         }
                       }
-                      llvm::errs() << "  result: [";
-                        llvm::interleaveComma(distributedShape, llvm::errs());
-                        llvm::errs() << "]\n";
                       return distributedShape;
                     }]>,
     InterfaceMethod</*desc=*/[{Check if this layout is a slice of another layout.}],
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ded262692ac75..0fbd339146d7b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -1061,11 +1061,6 @@ bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
 
   auto flattenedThis = flatten();
   auto parent = dyn_cast<LayoutAttr>(flattenedThis.getParent());
-  // debug print parent
-  llvm::dbgs() << "Parent layout - sgLayout: ";
-  for (auto &l : parent.getEffectiveSgLayoutAsInt()) {
-    llvm::dbgs() << l << " ";
-  }
   if (level == xegpu::LayoutKind::Subgroup) {
     int64_t wgSize = computeProduct(parent.getEffectiveSgLayoutAsInt());
     for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
@@ -1083,19 +1078,6 @@ bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
     for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
       auto coords = computeStaticDistributedCoords(id, shape);
       auto otherCoords = other.computeStaticDistributedCoords(id, shape);
-      llvm::dbgs() << "Lane comparison - id: " << id << ", coords: ";
-      for (auto &c : coords) {
-        llvm::dbgs() << "[";
-        llvm::interleaveComma(c, llvm::dbgs());
-        llvm::dbgs() << "] ";
-      }
-      llvm::dbgs() << "otherCoords: ";
-      for (auto &c : otherCoords) {
-        llvm::dbgs() << "[";
-        llvm::interleaveComma(c, llvm::dbgs());
-        llvm::dbgs() << "] ";
-      }
-      llvm::dbgs() << "\n";
       if (coords != otherCoords)
         return false;
     }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f5cf7592b1dc5..4ccdf000849b1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1091,26 +1091,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
-    LLVM_DEBUG(DBGS() << "LoadDistribution: attempting to match warpOp: "
-                      << warpOp << "\n");
-
-    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
-    LLVM_DEBUG(DBGS() << "LoadDistribution: warpOp.getTerminator() = "
-                      << warpOp.getTerminator() << "\n");
-    LLVM_DEBUG(
-        DBGS() << "LoadDistribution: warpOp.getTerminator()->getPrevNode() = "
-               << lastNode << "\n");
-
-    auto loadGatherOp1 = dyn_cast_or_null<xegpu::LoadGatherOp>(lastNode);
-    // print loadGatherOp and its operands
-    if (loadGatherOp1) {
-      LLVM_DEBUG(DBGS() << "LoadDistribution: found LoadGatherOp: "
-                        << loadGatherOp1 << "\n");
-    } else {
-      LLVM_DEBUG(DBGS() << "LoadDistribution: no LoadGatherOp found as last op "
-                           "in warp region\n");
-    }
-
     OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
       // Check if the yield operand that was produced by the *last* scattered
       // load op to avoid sinking it before barriers (maintain memory order).
@@ -1118,42 +1098,27 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
              warpOp.getTerminator()->getPrevNode() == op;
     });
     if (!producedByLastLoad) {
-      LLVM_DEBUG(DBGS() << "LoadDistribution: no LoadGatherOp as last op in "
-                           "warp region, bailing out\n");
       return rewriter.notifyMatchFailure(
           warpOp, "The last op is not xegpu::LoadGatherOp");
     }
 
     auto loadGatherOp =
         producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
-    LLVM_DEBUG(DBGS() << "LoadDistribution: found LoadGatherOp: "
-                      << loadGatherOp << "\n");
     auto offsets = loadGatherOp.getOffsets();
     if (!offsets || !isa<VectorType>(offsets.getType()) ||
-        !isa<VectorType>(loadGatherOp.getMask().getType())) {
-      LLVM_DEBUG(DBGS() << "LoadDistribution: offsets or mask are not vector "
-                           "types, bailing out\n");
+        !isa<VectorType>(loadGatherOp.getMask().getType()))
       return rewriter.notifyMatchFailure(
           loadGatherOp,
           "Load op must have a vector arguments for offsets and mask");
-    }
     VectorType offsetsTy = cast<VectorType>(offsets.getType());
     VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
     VectorType resultVecTy =
         cast<VectorType>(loadGatherOp.getResult().getType());
-    LLVM_DEBUG(DBGS() << "LoadDistribution: offsetsTy=" << offsetsTy
-                      << ", maskTy=" << maskTy
-                      << ", resultVecTy=" << resultVecTy << "\n");
     // add handling leading unit dimensions support
     int chunkSize = loadGatherOp.getChunkSize().value_or(1);
     int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
-    LLVM_DEBUG(DBGS() << "LoadDistribution: chunkSize=" << chunkSize
-                      << ", effectiveVecRank=" << effectiveVecRank << "\n");
     for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
       if (resultVecTy.getShape()[i] != 1) {
-        LLVM_DEBUG(DBGS() << "LoadDistribution: non-unit leading dim at index "
-                          << i << " (size=" << resultVecTy.getShape()[i]
-                          << "), bailing out\n");
         return rewriter.notifyMatchFailure(
             loadGatherOp, "Only unit dimensions allowed for the leading "
                           "dimensions of the load vector!");
@@ -1163,8 +1128,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
     auto layoutOffsets =
         xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
     auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));
-    LLVM_DEBUG(DBGS() << "LoadDistribution: layoutOffsets=" << layoutOffsets
-                      << ", layoutMask=" << layoutMask << "\n");
 
     FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
@@ -1172,9 +1135,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
     if (failed(distOffsetsByWarpOpOrFailure) ||
         failed(distMaskByWarpOpOrFailure)) {
-      LLVM_DEBUG(
-          DBGS() << "LoadDistribution: failed to compute distributed types "
-                    "for offsets or mask, bailing out\n");
       return rewriter.notifyMatchFailure(
           loadGatherOp,
           "Some vector operands have no layouts, using defaults instead.");
@@ -1188,24 +1148,12 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
         cast<VectorType>(warpOp.getResult(operandIdx).getType());
     VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
     VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-    LLVM_DEBUG(DBGS() << "LoadDistribution: operandIdx=" << operandIdx
-                      << ", distResultTy=" << distResultTy
-                      << ", distOffsetsTy=" << distOffsetsTy
-                      << ", distMaskTy=" << distMaskTy << "\n");
 
     SmallVector<Type> operandTypesToYield = {operands[0].getType(),
                                              distOffsetsTy, distMaskTy};
 
-    LLVM_DEBUG(DBGS() << "LoadDistribution: creating new warp op with "
-                      << operands.size() << " operands and "
-                      << operandTypesToYield.size() << " types to yield\n");
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-    LLVM_DEBUG({
-      DBGS() << "LoadDistribution: newRetIndices=[";
-      llvm::interleaveComma(newRetIndices, llvm::dbgs());
-      llvm::dbgs() << "]\n";
-    });
 
     rewriter.setInsertionPointAfter(newWarpOp);
 
@@ -1219,36 +1167,24 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
     VectorType distMaskTy1D =
         VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
                         distMaskByWarpOpOrFailure.value().getElementType());
-    LLVM_DEBUG(DBGS() << "LoadDistribution: 1D types: loadVecTy1D="
-                      << loadVecTy1D << ", distOffsetsTy1D=" << distOffsetsTy1D
-                      << ", distMaskTy1D=" << distMaskTy1D << "\n");
 
     Value distOffsetVal = resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
     Value distmaskVal = resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
-    LLVM_DEBUG(DBGS() << "LoadDistribution: resolved distOffsetVal="
-                      << distOffsetVal << ", distmaskVal=" << distmaskVal
-                      << "\n");
 
     SmallVector<Value> newLoadGatherOperands = {
         newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
 
-    LLVM_DEBUG(DBGS() << "LoadDistribution: creating distributed "
-                         "LoadGatherOp with result type "
-                      << loadVecTy1D << "\n");
     xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
         rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
         loadGatherOp->getAttrs());
     xegpu::removeLayoutAttrs(newOp);
-    LLVM_DEBUG(DBGS() << "LoadDistribution: created new op: " << newOp << "\n");
     Value distributedVal = newWarpOp.getResult(operandIdx);
     // Resolve the output type and replace all uses.
     rewriter.replaceAllUsesWith(
         distributedVal,
         resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
-    LLVM_DEBUG(DBGS() << "LoadDistribution: successfully distributed "
-                         "LoadGatherOp\n");
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 687617b5150cb..ade7276e52173 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -58,18 +58,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape,
     return std::make_pair(sgShape, count);
   auto sgData = layout.getEffectiveSgDataAsInt();
   count = computeProduct(distributedShape.value()) / computeProduct(sgData);
-  // auto sgLayout = layout.getEffectiveSgLayoutAsInt();
-  // auto sgData = layout.getEffectiveSgDataAsInt();
-  // SmallVector<int64_t> distUnit =
-  //     computeElementwiseMul(sgLayout, sgData);
-  // // Clamp distUnit to the original shape to handle wrap-around cases where
-  // data
-  // // is shared among subgroups, which may cause distUnit to exceed the
-  // original
-  // // shape.
-  // for (size_t i = 0; i < distUnit.size(); ++i)
-  //   distUnit[i] = std::min(shape[i], distUnit[i]);
-  // count = computeProduct(shape) / computeProduct(distUnit);
   return std::make_pair(sgData, count);
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 9884a46a7d5b8..26a211b043ead 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -103,35 +103,6 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
   return xegpu::getDistributedVectorType(helperTdescTy);
 }
 
-// FailureOr<VectorType>
-// xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
-//                                        VectorType originalType) {
-//   if (!layout)
-//     return failure();
-//   assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
-//          "Expecting a valid layout.");
-//   SmallVector<int64_t> effectiveLaneLayout =
-//       layout.getEffectiveLaneLayoutAsInt();
-//   assert(static_cast<size_t>(originalType.getRank()) >=
-//              effectiveLaneLayout.size() &&
-//          "Rank of the original vector type should be greater or equal to the
-//          " "size of the lane layout to distribute the vector type.");
-//   SmallVector<int64_t> distributedShape(originalType.getShape());
-//   // Only distribute the last `laneLayout.size()` dimensions. The remaining
-//   // dimensions are not distributed.
-//   unsigned distributionStart =
-//       originalType.getRank() - effectiveLaneLayout.size();
-//   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-//     if (i < distributionStart)
-//       continue;
-//     // Check if the dimension can be distributed evenly.
-//     if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-//       return failure();
-//     distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
-//   }
-//   return VectorType::get(distributedShape, originalType.getElementType());
-// }
-
 FailureOr<VectorType>
 xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                        VectorType originalType) {

>From f3239ecb591b5e07a3601cad3fd735ca0fd6648e Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 21:14:32 +0000
Subject: [PATCH 6/7] address feedback

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 18 +++++++++++------
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 20 ++-----------------
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7917a0dde556c..733857c03ce78 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -279,10 +279,16 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                     /*retTy=*/"SmallVector<SmallVector<int64_t>>",
                     /*methodName=*/"computeStaticDistributedCoords",
                     /*args=*/(ins "int64_t":$linearId, "ArrayRef<int64_t>":$shape)>,
-    InterfaceMethod<[{Computes the per-compute-unit shape by dividing each dimension of
-                      `shape` by the corresponding layout factor (sg_layout or
-                      lane_layout). For wrap-around dimensions where the division is uneven,
-                      the tensor tile is broadcasted to all subgroups/lanes.}],
+    InterfaceMethod<[{Computes the distributed shape for each compute unit by dividing each
+                      dimension of `shape` by the corresponding layout factor (sg_layout or
+                      lane_layout).
+                      The distributed shape represents the per-compute-unit tile. Each
+                      distribution unit is defined as the combination of layout factors and
+                      per-unit data (`subshape`, e.g., sg_data or lane_data). When `shape`
+                      spans multiple distribution units, the distributed shape may contain
+                      multiple such units.
+                      For wrap-around dimensions where the division is uneven, the tensor tile
+                      is broadcast to all subgroups/lanes.}],
                     /*retTy=*/"FailureOr<SmallVector<int64_t>>",
                     /*methodName=*/"computeDistributedShape",
                     /*args=*/(ins "SmallVector<int64_t>":$shape),
@@ -303,8 +309,8 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                           "sgdata or lanedata cannot be empty for distributed shape computation");
                       SmallVector<int64_t> distributedShape(shape.size());
                       for (auto [i, dim] : llvm::enumerate(shape)) {
-                        int64_t distri_unit = layout[i]*subShape[i];
-                        if ((dim % distri_unit) == 0) {
+                        int64_t distriUnit = layout[i]*subShape[i];
+                        if ((dim % distriUnit) == 0) {
                           // Evenly divisible case, divide the dimension by the layout factor.
                           distributedShape[i] = dim / layout[i];
                           assert((distributedShape[i] % subShape[i] == 0) &&
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0fbd339146d7b..615a3b7828b77 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -409,15 +409,7 @@ LayoutAttr::computeStaticDistributedCoords(int64_t linearId,
 
   // Delinearize the linear ID using the order attribute.
   DenseI32ArrayAttr orderAttr = getOrder();
-  SmallVector<int64_t> order;
-  if (orderAttr && !orderAttr.empty()) {
-    order = llvm::map_to_vector(orderAttr.asArrayRef(), [](int32_t idx) {
-      return static_cast<int64_t>(idx);
-    });
-  } else {
-    order =
-        llvm::to_vector(llvm::reverse(llvm::seq<int64_t>(0, layoutVec.size())));
-  }
+  SmallVector<int64_t> order = getEffectiveOrderAsInt();
   SmallVector<int64_t> delinearizedId(layoutVec.size());
   int64_t remaining = linearId;
   for (size_t i = 0; i < order.size(); ++i) {
@@ -964,15 +956,7 @@ SliceAttr::computeStaticDistributedCoords(int64_t linearId,
     parentLayoutVec = parent.getEffectiveLaneLayoutAsInt();
 
   DenseI32ArrayAttr orderAttr = parent.getOrder();
-  SmallVector<int64_t> order;
-  if (orderAttr && !orderAttr.empty()) {
-    order = llvm::map_to_vector(orderAttr.asArrayRef(), [](int32_t idx) {
-      return static_cast<int64_t>(idx);
-    });
-  } else {
-    order = llvm::to_vector(
-        llvm::reverse(llvm::seq<int64_t>(0, parentLayoutVec.size())));
-  }
+  SmallVector<int64_t> order = parent.getEffectiveOrderAsInt();
   SmallVector<int64_t> allIds(parentLayoutVec.size());
   int64_t remaining = linearId;
   for (size_t i = 0; i < order.size(); ++i) {

>From 5cf6741fcb9bcf72dd3bf67083aa260b57af553c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 22:08:18 +0000
Subject: [PATCH 7/7] refactor with helpers

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 110 ++++++++----------
 .../Transforms/XeGPUSubgroupDistribute.cpp    |   3 +-
 2 files changed, 49 insertions(+), 64 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 615a3b7828b77..2ad63626c4b67 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -96,6 +96,31 @@ genCoordinates(OpBuilder &builder, Location loc,
   return coordinates;
 }
 
+static SmallVector<SmallVector<int64_t>> genStaticCoordinates(
+    llvm::ArrayRef<int64_t> canonicalIds, llvm::ArrayRef<int64_t> layout,
+    llvm::ArrayRef<int64_t> subShape, llvm::ArrayRef<int64_t> shape) {
+  // Distribution unit per dim: layout[i] * subShape[i], clamped to shape[i].
+  SmallVector<int64_t> distUnitShape(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    distUnitShape[i] = std::min(shape[i], layout[i] * subShape[i]);
+
+  // Local offset of this ID within a unit: canonicalIds[i] * subShape[i].
+  SmallVector<int64_t> localOffset(shape.size());
+  for (size_t i = 0; i < shape.size(); ++i)
+    localOffset[i] = canonicalIds[i] * subShape[i];
+
+  // Enumerate all distribution units; `% shape[i]` wraps offsets past the end.
+  SmallVector<SmallVector<int64_t>> coordinates;
+  for (SmallVector<int64_t> unitOffs :
+       StaticTileOffsetRange(shape, distUnitShape)) {
+    SmallVector<int64_t> coord(shape.size());
+    for (size_t i = 0; i < shape.size(); ++i)
+      coord[i] = (unitOffs[i] + localOffset[i]) % shape[i];
+    coordinates.push_back(coord);
+  }
+  return coordinates;
+}
+
 // Checks if the given shape can be evenly distributed based on the layout
 // and data factors provided by the LayoutAttr.
 bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
@@ -408,7 +433,6 @@ LayoutAttr::computeStaticDistributedCoords(int64_t linearId,
   assert(!subShape.empty() && "sgdata or lanedata cannot be empty");
 
   // Delinearize the linear ID using the order attribute.
-  DenseI32ArrayAttr orderAttr = getOrder();
   SmallVector<int64_t> order = getEffectiveOrderAsInt();
   SmallVector<int64_t> delinearizedId(layoutVec.size());
   int64_t remaining = linearId;
@@ -418,26 +442,7 @@ LayoutAttr::computeStaticDistributedCoords(int64_t linearId,
     remaining = remaining / layoutVec[dimIdx];
   }
 
-  // Compute distribution unit shape (clamped to srcShape).
-  SmallVector<int64_t> distUnitShape(shape.size());
-  for (size_t i = 0; i < shape.size(); ++i)
-    distUnitShape[i] = std::min(shape[i], layoutVec[i] * subShape[i]);
-
-  // Compute local offset of this ID within a distribution unit.
-  SmallVector<int64_t> localOffset(shape.size());
-  for (size_t i = 0; i < shape.size(); ++i)
-    localOffset[i] = delinearizedId[i] * subShape[i];
-
-  // Enumerate all distribution units and compute coordinates.
-  SmallVector<SmallVector<int64_t>> coordinates;
-  for (SmallVector<int64_t> unitOffs :
-       StaticTileOffsetRange(shape, distUnitShape)) {
-    SmallVector<int64_t> coord(shape.size());
-    for (size_t i = 0; i < shape.size(); ++i)
-      coord[i] = (unitOffs[i] + localOffset[i]) % shape[i];
-    coordinates.push_back(coord);
-  }
-  return coordinates;
+  return genStaticCoordinates(delinearizedId, layoutVec, subShape, shape);
 }
 
 // set the layout for unit dims: sg_data, inst_data and lane_data to 1
@@ -802,26 +807,27 @@ bool LayoutAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
                   other.getEffectiveLaneLayoutAsInt() &&
               getEffectiveLaneDataAsInt() == other.getEffectiveLaneDataAsInt());
   }
-  if (level == xegpu::LayoutKind::Subgroup) {
-    int64_t wgSize = computeProduct(getEffectiveSgLayoutAsInt());
-    for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
+
+  auto compareCoordsForAllIds = [&](int64_t size) {
+    for (int64_t id : llvm::seq<int64_t>(0, size)) {
       auto coords = computeStaticDistributedCoords(id, shape);
       auto otherCoords = other.computeStaticDistributedCoords(id, shape);
       if (coords != otherCoords)
         return false;
     }
+    return true;
+  };
+
+  if (level == xegpu::LayoutKind::Subgroup) {
+    int64_t wgSize = computeProduct(getEffectiveSgLayoutAsInt());
+    return compareCoordsForAllIds(wgSize);
   }
   if (level == xegpu::LayoutKind::InstData) {
     return (getEffectiveInstDataAsInt() == other.getEffectiveInstDataAsInt());
   }
   if (level == xegpu::LayoutKind::Lane) {
     int64_t subgroupSize = computeProduct(getEffectiveLaneLayoutAsInt());
-    for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
-      auto coords = computeStaticDistributedCoords(id, shape);
-      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
-      if (coords != otherCoords)
-        return false;
-    }
+    return compareCoordsForAllIds(subgroupSize);
   }
   return true;
 }
@@ -955,7 +961,6 @@ SliceAttr::computeStaticDistributedCoords(int64_t linearId,
   else
     parentLayoutVec = parent.getEffectiveLaneLayoutAsInt();
 
-  DenseI32ArrayAttr orderAttr = parent.getOrder();
   SmallVector<int64_t> order = parent.getEffectiveOrderAsInt();
   SmallVector<int64_t> allIds(parentLayoutVec.size());
   int64_t remaining = linearId;
@@ -972,26 +977,7 @@ SliceAttr::computeStaticDistributedCoords(int64_t linearId,
   SmallVector<int64_t> canonicalIds =
       XeGPUDialect::slice(ArrayRef<int64_t>(allIds), dims);
 
-  // Compute distribution unit shape (clamped to srcShape).
-  SmallVector<int64_t> distUnitShape(shape.size());
-  for (size_t i = 0; i < shape.size(); ++i)
-    distUnitShape[i] = std::min(shape[i], layout[i] * subShape[i]);
-
-  // Compute local offset of this ID within a distribution unit.
-  SmallVector<int64_t> localOffset(shape.size());
-  for (size_t i = 0; i < shape.size(); ++i)
-    localOffset[i] = canonicalIds[i] * subShape[i];
-
-  // Enumerate all distribution units and compute coordinates.
-  SmallVector<SmallVector<int64_t>> coordinates;
-  for (SmallVector<int64_t> unitOffs :
-       StaticTileOffsetRange(shape, distUnitShape)) {
-    SmallVector<int64_t> coord(shape.size());
-    for (size_t i = 0; i < shape.size(); ++i)
-      coord[i] = (unitOffs[i] + localOffset[i]) % shape[i];
-    coordinates.push_back(coord);
-  }
-  return coordinates;
+  return genStaticCoordinates(canonicalIds, layout, subShape, shape);
 }
 
 bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
@@ -1043,28 +1029,28 @@ bool SliceAttr::isCompatibleWith(const xegpu::DistributeLayoutAttr &other,
         return true;
   }
 
-  auto flattenedThis = flatten();
-  auto parent = dyn_cast<LayoutAttr>(flattenedThis.getParent());
-  if (level == xegpu::LayoutKind::Subgroup) {
-    int64_t wgSize = computeProduct(parent.getEffectiveSgLayoutAsInt());
-    for (int64_t id : llvm::seq<int64_t>(0, wgSize)) {
+  auto compareCoordsForAllIds = [&](int64_t size) {
+    for (int64_t id : llvm::seq<int64_t>(0, size)) {
       auto coords = computeStaticDistributedCoords(id, shape);
       auto otherCoords = other.computeStaticDistributedCoords(id, shape);
       if (coords != otherCoords)
         return false;
     }
+    return true;
+  };
+
+  auto flattenedThis = flatten();
+  auto parent = dyn_cast<LayoutAttr>(flattenedThis.getParent());
+  if (level == xegpu::LayoutKind::Subgroup) {
+    int64_t wgSize = computeProduct(parent.getEffectiveSgLayoutAsInt());
+    return compareCoordsForAllIds(wgSize);
   }
   if (level == xegpu::LayoutKind::InstData) {
     return (getEffectiveInstDataAsInt() == other.getEffectiveInstDataAsInt());
   }
   if (level == xegpu::LayoutKind::Lane) {
     int64_t subgroupSize = computeProduct(parent.getEffectiveLaneLayoutAsInt());
-    for (int64_t id : llvm::seq<int64_t>(0, subgroupSize)) {
-      auto coords = computeStaticDistributedCoords(id, shape);
-      auto otherCoords = other.computeStaticDistributedCoords(id, shape);
-      if (coords != otherCoords)
-        return false;
-    }
+    return compareCoordsForAllIds(subgroupSize);
   }
   return true;
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4ccdf000849b1..6d6725617d306 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1097,10 +1097,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
       return isa<xegpu::LoadGatherOp>(op) &&
              warpOp.getTerminator()->getPrevNode() == op;
     });
-    if (!producedByLastLoad) {
+    if (!producedByLastLoad)
       return rewriter.notifyMatchFailure(
           warpOp, "The last op is not xegpu::LoadGatherOp");
-    }
 
     auto loadGatherOp =
         producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();