[Mlir-commits] [mlir] [MLIR][XeGPU] Enhance multi-reduction layout propagation rules (PR #186308)

Fri Mar 20 07:33:56 PDT 2026

https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/186308

>From af03ed33a0ad6c5afe2a73a17a06751035650b96 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 13 Mar 2026 03:35:06 +0000
Subject: [PATCH 1/9] improve multireduction layout support for high-d and
 consumer's nested slice attr

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 135 ++++++++++++------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |   2 -
 .../XeGPU/propagate-layout-subgroup.mlir      |  39 ++++-
 3 files changed, 124 insertions(+), 52 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index feefeb727a732..2f3044df0f1e7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -432,75 +432,118 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     return DenseI32ArrayAttr::get(context, vec32);
   };
 
-  // Extract original plain layout for workgroup/subgroup size recovery
-  xegpu::SliceAttr consumerSliceLayout =
-      dyn_cast<xegpu::SliceAttr>(consumerLayout);
-  DistributeLayoutAttr plainLayout =
-      consumerSliceLayout ? consumerSliceLayout.flatten().getParent()
-                          : consumerLayout;
+  // Helper lambda to check if the layout from consumer can be reused for the
+  // source shape
+  auto isLayoutCompatibleWithSrcShape =
+      [&](ArrayRef<int64_t> srcShape,
+          xegpu::DistributeLayoutAttr srcLayout) -> bool {
+    SmallVector<int64_t> sgLayout = srcLayout.getEffectiveSgLayoutAsInt();
+    SmallVector<int64_t> laneLayout = srcLayout.getEffectiveLaneLayoutAsInt();
+    if (static_cast<size_t>(srcLayout.getRank()) != srcShape.size())
+      return false;
+    for (size_t i = 0; i < srcShape.size(); i++) {
+      if (!sgLayout.empty() && srcShape[i] % sgLayout[i] != 0)
+        return false;
+      if (!laneLayout.empty() && srcShape[i] % laneLayout[i] != 0)
+        return false;
+    }
+    return true;
+  };
 
+  // Extract original plain layout for workgroup/subgroup size recovery
+  xegpu::DistributeLayoutAttr rootPlainLayout = consumerLayout;
+  while (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(rootPlainLayout)) {
+    rootPlainLayout = sliceAttr.getParent();
+  }
+  auto sgLayoutVec = rootPlainLayout.getEffectiveSgLayoutAsInt();
+  const int workgroupSize = std::accumulate(
+      sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
   const int subgroupSize = uArch->getSubgroupSize();
   int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
 
-  xegpu::DistributeLayoutAttr srcLayout;
+  xegpu::SliceAttr consumerSliceLayout =
+      dyn_cast<xegpu::SliceAttr>(consumerLayout);
+  SmallVector<int64_t> consumerSgLayout =
+      consumerLayout.getEffectiveSgLayoutAsInt();
+  SmallVector<int64_t> consumerLaneLayout =
+      consumerLayout.getEffectiveLaneLayoutAsInt();
+  SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
+  DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
 
+  xegpu::DistributeLayoutAttr srcLayout;
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
-    auto sgLayoutVec = plainLayout.getEffectiveSgLayoutAsInt();
-    const int workgroupSize = std::accumulate(
-        sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
-    SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank);
-    SmallVector<int64_t> consumerSgLayout =
-        consumerLayout.getEffectiveSgLayoutAsInt();
-    int remainingSgCount = workgroupSize;
-    int consumerIdx = consumerSgLayout.size() - 1;
-
-    // First pass: Match consumer's layout on non-reduction dimensions
-    for (int i = srcRank - 1; i >= 0; i--) {
-      if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
-        sgLayout[i] = consumerSgLayout[consumerIdx];
-        assert((srcShape[i] % sgLayout[i] == 0) &&
-               "source shape not divisible by consumer sg_layout");
-        sgData[i] = srcShape[i] / sgLayout[i];
-        remainingSgCount /= sgLayout[i];
-        consumerIdx--;
+    if (consumerSliceLayout &&
+        consumerSliceLayout.getDims().asArrayRef().equals(reductionDims) &&
+        isLayoutCompatibleWithSrcShape(srcShape,
+                                       consumerSliceLayout.getParent())) {
+      int64_t sgDataValue = -1;
+      srcLayout = consumerSliceLayout.getParent();
+      SmallVector<int64_t> sgLayoutFromConsumer =
+          srcLayout.getEffectiveSgLayoutAsInt();
+      for (int dim = 0; dim < srcRank; dim++) {
+        sgDataValue = srcShape[dim] / sgLayoutFromConsumer[dim];
+        srcLayout = srcLayout.setDimData(dim, sgDataValue, -1, -1);
       }
-    }
+    } else {
 
-    // Second pass: Distribute remaining subgroups across reduction dimensions
-    for (int i = srcRank - 1; i >= 0; i--) {
-      if (llvm::is_contained(reductionDims, i)) {
-        sgLayout[i] =
-            std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
-        assert((srcShape[i] % sgLayout[i] == 0) &&
-               "source shape not divisible by sg_layout");
-        sgData[i] = srcShape[i] / sgLayout[i];
-        remainingSgCount /= sgLayout[i];
+      SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
+      int remainingSgCount = workgroupSize;
+      int consumerIdx = consumerSgLayout.size() - 1;
+
+      // First pass: Match consumer's layout on non-reduction dimensions
+      for (int i = srcRank - 1; i >= 0; i--) {
+        if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
+          sgLayout[i] = consumerSgLayout[consumerIdx];
+          assert((srcShape[i] % sgLayout[i] == 0) &&
+                 "source shape not divisible by consumer sg_layout");
+          sgData[i] = srcShape[i] / sgLayout[i];
+          remainingSgCount /= sgLayout[i];
+          order[i] = consumerOrder[consumerIdx];
+          consumerIdx--;
+        }
       }
-    }
 
-    assert(remainingSgCount == 1 && "not all subgroups distributed");
-    srcLayout = xegpu::LayoutAttr::get(
-        context, toInt32Attr(sgLayout), toInt32Attr(sgData),
-        /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
-        /*lane_data =*/nullptr, /*order =*/nullptr);
+      // Second pass: Distribute remaining subgroups across reduction dimensions
+      int64_t remainOrder = consumerSgLayout.size();
+      for (int i = srcRank - 1; i >= 0; i--) {
+        if (llvm::is_contained(reductionDims, i)) {
+          sgLayout[i] =
+              std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
+          assert((srcShape[i] % sgLayout[i] == 0) &&
+                 "source shape not divisible by sg_layout");
+          sgData[i] = srcShape[i] / sgLayout[i];
+          remainingSgCount /= sgLayout[i];
+          order[i] = remainOrder++;
+        }
+      }
 
+      assert(remainingSgCount == 1 && "not all subgroups distributed");
+      srcLayout = xegpu::LayoutAttr::get(
+          context, toInt32Attr(sgLayout), toInt32Attr(sgData),
+          /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+          /*lane_data =*/nullptr, /*order =*/
+          (!orderAttr || orderAttr.empty()) ? nullptr : toInt32Attr(order));
+    }
   } else if (layoutKind == xegpu::LayoutKind::InstData) {
 
     SmallVector<int64_t> instData(srcRank, 1);
     instData[srcRank - 2] =
         std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
-    instData[srcRank - 1] = subgroupSize;
+    instData[srcRank - 1] =
+        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
 
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
 
     SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
-    laneLayout[srcRank - 1] = subgroupSize;
+    laneLayout[srcRank - 1] =
+        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     laneData[srcRank - 2] =
         std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
-    srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
-                                       toInt32Attr(laneData),
-                                       consumerLayout.getOrder());
+    srcLayout = xegpu::LayoutAttr::get(
+        context, toInt32Attr(laneLayout), toInt32Attr(laneData),
+        (!orderAttr || orderAttr.empty()) ? nullptr
+                                          : toInt32Attr(consumerOrder));
   }
 
   return xegpu::SliceAttr::get(context, srcLayout,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8bf0f2aca60c5..00008ae344d73 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1136,7 +1136,6 @@ void LayoutInfoPropagation::visitLoadMatrixOp(
   if (!hasParamsOfLayoutKind(anchorLayout)) {
     VectorType resVecTy =
         llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
-    assert(resVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
     const uArch *uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
     if (!uArch)
       return;
@@ -1157,7 +1156,6 @@ void LayoutInfoPropagation::visitStoreMatrixOp(
   } else {
     VectorType srcVecTy =
         llvm::cast<VectorType>(storeMatrix.getData().getType());
-    assert(srcVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
     const uArch *uArch = getUArch(getChipStr(storeMatrix).value_or(""));
     if (!uArch)
       return;
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index 39fd815b1b380..d730d04c819fa 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -128,8 +128,7 @@ gpu.module @test {
 gpu.module @test {
 // CHECK-LABEL: vector_row_reduction
 // CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
-  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes
-      {known_block_size = array<i32: 1, 32, 1>} {
+  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@@ -144,8 +143,7 @@ gpu.module @test {
 // -----
 gpu.module @test {
 // CHECK-LABEL: vector_nest_reduction
-  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes
-      {known_block_size = array<i32: 1, 32, 1>} {
+  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@@ -165,6 +163,39 @@ gpu.module @test {
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_nest_reduction_with_nest_slice_layout
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>, dims = [1]>} dense<0.000000e+00> : vector<32xf32>
+// CHECK: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>} dense<0.000000e+00> : vector<32x128xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32, #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>}>
+// CHECK-SAME: -> vector<32x128xf32>
+// CHECK: %[[BCAST1:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>} : vector<32x128xf32> to vector<4x32x128xf32>
+// CHECK: %[[REDUCE1:.*]] = vector.multi_reduction <add>, %[[BCAST1]], %[[CST0]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>} [0] : vector<4x32x128xf32> to vector<32x128xf32>
+// CHECK: %[[REDUCE2:.*]] = vector.multi_reduction <add>, %[[REDUCE1]], %[[CST]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>, dims = [1]>} [1] : vector<32x128xf32> to vector<32xf32>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} dense<true> : vector<32xi1>
+// CHECK: %[[OFFSET:.*]] = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} : vector<32xindex>
+// CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
+// CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
+// CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
+  gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+    %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
+    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
+    %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32>
+    %bcast1 = vector.broadcast %load: vector<32x128xf32> to vector<4x32x128xf32>
+    %bcast = vector.multi_reduction <add>, %bcast1, %cst1 [0]: vector<4x32x128xf32> to vector<32x128xf32>
+    %reduce = vector.multi_reduction <add>, %bcast, %cst [1] : vector<32x128xf32> to vector<32xf32>
+    %mask = arith.constant dense<1>: vector<32xi1>
+    %offset = vector.step : vector<32xindex>
+    xegpu.store %reduce, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
+    gpu.return
+  }
+}
+
 // -----
 gpu.module @test {
   // CHECK-LABEL: for_loop_dpas

>From 385eee510e7f5a3419c4db935e57f0d2da006c23 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 15:49:24 +0000
Subject: [PATCH 2/9] using computeShapeRatio and enhance layout.getNumSubgroups

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       | 20 ++++++---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 45 ++++---------------
 2 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index ce0cce65373e5..d16787297b302 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -193,12 +193,7 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                     "getRank">,
     InterfaceMethod<"Get the num of effective subgroups",
                     "int64_t",
-                    "getNumSubgroups", (ins), [{
-                        std::optional<SmallVector<int64_t>> sgLayout = llvm::cast<ConcreteAttr>(tablegen_opaque_val).getEffectiveSgLayoutAsInt();
-                        if (sgLayout.has_value())
-                          return computeProduct(*sgLayout);
-                        return 0;
-                    }], [{}]>,
+                    "getNumSubgroups">,
     InterfaceMethod<"Get the order of the layout attribute",
                     "DenseI32ArrayAttr",
                     "getOrder">,
@@ -464,6 +459,13 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
       return 0;
     }
 
+    int64_t getNumSubgroups() const {
+      // Number of subgroups = product of the effective sg_layout entries;
+      // a layout without an effective sg_layout reports 0.
+      auto sgLayout = getEffectiveSgLayoutAsInt();
+      return sgLayout.empty() ? 0 : computeProduct(sgLayout);
+    }
+
     LayoutAttr dropSgLayoutAndData() const{
       // avoid every field of the attribute is nullptr, which may lead to segment fault
       if (!getInstData() && !getLaneLayout())
@@ -613,6 +615,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
       return parent.getRank() - attr.getDims().size();
     }
 
+    int64_t getNumSubgroups() const {
+      // Report the flattened parent's subgroup count, not its rank.
+      auto parent = dyn_cast<LayoutAttr>(flatten().getParent());
+      return parent.getNumSubgroups();
+    }
+
     DenseI32ArrayAttr getOrder() const {
       SliceAttr attr = flatten();
       auto parent = dyn_cast<LayoutAttr>(attr.getParent());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 2812efdaee27a..9236e0cea3a02 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -437,37 +437,10 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     return DenseI32ArrayAttr::get(context, vec32);
   };
 
-  // Helper lambda to check if the layout from consumer can be reused for the
-  // source shape
-  auto isLayoutCompatibleWithSrcShape =
-      [&](ArrayRef<int64_t> srcShape,
-          xegpu::DistributeLayoutAttr srcLayout) -> bool {
-    SmallVector<int64_t> sgLayout = srcLayout.getEffectiveSgLayoutAsInt();
-    SmallVector<int64_t> laneLayout = srcLayout.getEffectiveLaneLayoutAsInt();
-    if (static_cast<size_t>(srcLayout.getRank()) != srcShape.size())
-      return false;
-    for (size_t i = 0; i < srcShape.size(); i++) {
-      if (!sgLayout.empty() && srcShape[i] % sgLayout[i] != 0)
-        return false;
-      if (!laneLayout.empty() && srcShape[i] % laneLayout[i] != 0)
-        return false;
-    }
-    return true;
-  };
-
-  // Extract original plain layout for workgroup/subgroup size recovery
-  xegpu::DistributeLayoutAttr rootPlainLayout = consumerLayout;
-  while (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(rootPlainLayout)) {
-    rootPlainLayout = sliceAttr.getParent();
-  }
-  auto sgLayoutVec = rootPlainLayout.getEffectiveSgLayoutAsInt();
-  const int workgroupSize = std::accumulate(
-      sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
+  const int workgroupSize = consumerLayout.getNumSubgroups();
   const int subgroupSize = uArch->getSubgroupSize();
   int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
 
-  xegpu::SliceAttr consumerSliceLayout =
-      dyn_cast<xegpu::SliceAttr>(consumerLayout);
   SmallVector<int64_t> consumerSgLayout =
       consumerLayout.getEffectiveSgLayoutAsInt();
   SmallVector<int64_t> consumerLaneLayout =
@@ -477,18 +450,18 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
 
   xegpu::DistributeLayoutAttr srcLayout;
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    xegpu::SliceAttr consumerSliceLayout =
+        dyn_cast<xegpu::SliceAttr>(consumerLayout);
     if (consumerSliceLayout &&
-        consumerSliceLayout.getDims().asArrayRef().equals(reductionDims) &&
-        isLayoutCompatibleWithSrcShape(srcShape,
-                                       consumerSliceLayout.getParent())) {
-      int64_t sgDataValue = -1;
+        consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
       srcLayout = consumerSliceLayout.getParent();
       SmallVector<int64_t> sgLayoutFromConsumer =
           srcLayout.getEffectiveSgLayoutAsInt();
-      for (int dim = 0; dim < srcRank; dim++) {
-        sgDataValue = srcShape[dim] / sgLayoutFromConsumer[dim];
-        srcLayout = srcLayout.setDimData(dim, sgDataValue, -1, -1);
-      }
+      auto srcSgData = computeShapeRatio(srcShape, sgLayoutFromConsumer);
+      if (srcSgData)
+        for (int dim = 0; dim < srcRank; dim++) {
+          srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
+        }
     } else {
 
       SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);

>From 7beb79fef8f6cac970081247b753c4eddd969c5f Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 17:30:49 +0000
Subject: [PATCH 3/9] add comments

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 45 +++++++++++++------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 9236e0cea3a02..3e98b61bc385c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -388,7 +388,9 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 /// layout and data with the consumer's layout on non-reduction dimensions.
 /// Then, it distributes remaining subgroups across reduction dimensions. This
 /// avoids subgroup data redistribution overhead between the reduced result and
-/// its consumer.
+/// its consumer. When the consumer layout is a slice layout, it attempts to
+/// reuse the slice layout's parent layout for the source to further minimize
+/// potential data redistribution.
 ///
 /// InstData requries {1, ..., min(maxReduceVectorSize, srcShape),subgroupSize}
 /// Lane Layout requires {1, ..., 1, subgroupSize}
@@ -396,17 +398,31 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 ///
 /// Examples:
 ///   1. Subgroup layout - Row reduction on 2D tensor:
-///      srcShape=[32, 64], reductionDims=[1], resShape=[32], subgroupSize=16,
+///      srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
 ///      workgroupSize=32
 ///      Consumer Layout:
 ///      #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
-///      [1]>} Result: srcLayout with sgLayout=[4, 8], sgData=[8, 8] (matches
-///      consumer on non-reduction dim, minimizing data redistribution on
-///      reduction dim)
-///   2. Subgroup layout - Same example above but consumer has different layout:
-///      sgLayout=[32], sgData=[1]
-///      Result: srcLayout with sgLayout=[32,1], sgData=[1, 64]
-///      (distributes all subgroups on non reduction dim)
+///      [1]>}
+////     Result Layout: #xegpu.slice<#xegpu.layout<sg_layout=[4, 8],
+///      sg_data=[8, 16]>, dims = [1]>} Note that the sg_layout is reused but
+///      sg_data needs to be adjusted to evenly distribute the source tensor
+///      tile among the reduction dim.
+///   2. Subgroup layout - Same example above but consumer doesn't have a
+///   reusable slice layout.
+///      Consumer Layout:
+///      #xegpu.layout<sgLayout=[32], sgData=[1]>
+///      Result Layout:
+///      #xegpu.slice<#xegpu.layout<sgLayout=[32,1], sgData=[1, 64]>, dims =
+///      [1]>}
+///      Consumer Layout:
+///      #xegpu.slice<#xegpu.layout<sgLayout=[8, 2, 4], sgData=[4, 64, 32]>,
+///      dims = [1, 2]>} Result Layout:
+///      #xegpu.slice<#xegpu.layout<sgLayout=[8,4], sgData=[4, 32]>, dims =
+///      [1]>}
+///      Note that the consumer's layout can't be directly reused as is.
+///      So the algorithm distributes all subgroups on non reduction dimensions
+///      first and then distribute remaining subgroups on the reduction
+///      dimension.
 ///
 ///   2. InstData layout - Column reduction:
 ///      srcShape=[32, 64], reductionDims=[0], subgroupSize=16
@@ -466,24 +482,25 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
 
       SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
       int remainingSgCount = workgroupSize;
-      int consumerIdx = consumerSgLayout.size() - 1;
+      int consumerIdx = 0;
 
       // First pass: Match consumer's layout on non-reduction dimensions
-      for (int i = srcRank - 1; i >= 0; i--) {
-        if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
+      for (int i = 0; i < srcRank; i++) {
+        if (!llvm::is_contained(reductionDims, i) &&
+            consumerIdx < consumerSgLayout.size()) {
           sgLayout[i] = consumerSgLayout[consumerIdx];
           assert((srcShape[i] % sgLayout[i] == 0) &&
                  "source shape not divisible by consumer sg_layout");
           sgData[i] = srcShape[i] / sgLayout[i];
           remainingSgCount /= sgLayout[i];
           order[i] = consumerOrder[consumerIdx];
-          consumerIdx--;
+          consumerIdx++;
         }
       }
 
       // Second pass: Distribute remaining subgroups across reduction dimensions
       int64_t remainOrder = consumerSgLayout.size();
-      for (int i = srcRank - 1; i >= 0; i--) {
+      for (int i = 0; i < srcRank; i++) {
         if (llvm::is_contained(reductionDims, i)) {
           sgLayout[i] =
               std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));

>From 78a5f6c71cdb20797c67005a6400cf6314f3d76a Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 17:34:01 +0000
Subject: [PATCH 4/9] polish comments

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 37 ++++++++++---------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 3e98b61bc385c..917e88000b0c2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -400,25 +400,28 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 ///   1. Subgroup layout - Row reduction on 2D tensor:
 ///      srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
 ///      workgroupSize=32
-///      Consumer Layout:
-///      #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
-///      [1]>}
-////     Result Layout: #xegpu.slice<#xegpu.layout<sg_layout=[4, 8],
-///      sg_data=[8, 16]>, dims = [1]>} Note that the sg_layout is reused but
-///      sg_data needs to be adjusted to evenly distribute the source tensor
-///      tile among the reduction dim.
+///      * Consumer Layout:
+///        #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
+///        [1]>}
+///      * Result Layout:
+///        #xegpu.slice<#xegpu.layout<sg_layout=[4, 8],sg_data=[8, 16]>, dims =
+///        [1]>}
+///      Note that the sg_layout is reused but sg_data needs to be adjusted to
+///      evenly distribute the source tensor tile among the reduction dim.
+///
 ///   2. Subgroup layout - Same example above but consumer doesn't have a
 ///   reusable slice layout.
-///      Consumer Layout:
-///      #xegpu.layout<sgLayout=[32], sgData=[1]>
-///      Result Layout:
-///      #xegpu.slice<#xegpu.layout<sgLayout=[32,1], sgData=[1, 64]>, dims =
-///      [1]>}
-///      Consumer Layout:
-///      #xegpu.slice<#xegpu.layout<sgLayout=[8, 2, 4], sgData=[4, 64, 32]>,
-///      dims = [1, 2]>} Result Layout:
-///      #xegpu.slice<#xegpu.layout<sgLayout=[8,4], sgData=[4, 32]>, dims =
-///      [1]>}
+///      * Consumer Layout:
+///        #xegpu.layout<sgLayout=[32], sgData=[1]>
+///      * Result Layout:
+///        #xegpu.slice<#xegpu.layout<sgLayout=[32,1], sgData=[1, 64]>, dims =
+///        [1]>}
+///      * Consumer Layout:
+///        #xegpu.slice<#xegpu.layout<sgLayout=[8, 2, 4], sgData=[4, 64, 32]>,
+///        dims = [1, 2]>}
+///      * Result Layout:
+///        #xegpu.slice<#xegpu.layout<sgLayout=[8,4], sgData=[4, 32]>, dims =
+///        [1]>}
 ///      Note that the consumer's layout can't be directly reused as is.
 ///      So the algorithm distributes all subgroups on non reduction dimensions
 ///      first and then distribute remaining subgroups on the reduction

>From d839b2ddddb8d463b5bb920cb77fbcc5f95b02e8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 18:36:19 +0000
Subject: [PATCH 5/9] adding tests

---
 .../XeGPU/propagate-layout-inst-data.mlir     | 18 +++++++++++++
 .../XeGPU/propagate-layout-subgroup.mlir      | 10 +++----
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 27 +++++++++++++++++++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 503fb25deb151..70e7010ce2367 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -297,6 +297,24 @@ func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size(
+// CHECK: %[[ReduceVal:.*]] = vector.multi_reduction <add>, %[[Val:.*]], %[[CST:.*]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x16x1xf16> to vector<1xf16>
+func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<16xi1>
+    %0 = vector.step : vector<16xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+    %2 = vector.shape_cast %1 : vector<16xf16> to vector<1x16x1xf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x16x1xf16> to vector<1xf16>
+    %cst_2 = arith.constant dense<true> : vector<1xi1>
+    %cst_3 = arith.constant dense<1> : vector<1xindex>
+    xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [1, 2]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index d730d04c819fa..e4e6d61b92fda 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -289,13 +289,13 @@ gpu.module @test {
 // -----
 gpu.module @xevm_module{
   // CHECK-LABEL: load_store_matrix
-  gpu.func @load_store_matrix(%arg0: !xegpu.mem_desc<64x128xf32>, %sg_id_lt_2: i1) {
+  gpu.func @load_store_matrix(%arg0: !xegpu.mem_desc<1x64x128xf32>, %sg_id_lt_2: i1) {
     %c0 = arith.constant 0 : index
     scf.if %sg_id_lt_2 {
-      // CHECK: xegpu.load_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}>
-      %1 = xegpu.load_matrix %arg0[%c0, %c0] : !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
-      // CHECK: xegpu.store_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}>
-      xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}> : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+      // CHECK: xegpu.load_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}>
+      %1 = xegpu.load_matrix %arg0[%c0, %c0, %c0] : !xegpu.mem_desc<1x64x128xf32>, index, index, index -> vector<1x32x32xf32>
+      // CHECK: xegpu.store_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}>
+      xegpu.store_matrix %1, %arg0[%c0, %c0, %c0] <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}> : vector<1x32x32xf32>, !xegpu.mem_desc<1x64x128xf32>, index, index, index
     }
     gpu.return
   }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index ddd2d22108d1f..19bd7037ed71d 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -688,6 +688,33 @@ func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1
     return
   }
 }
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [1, 2]>} dense<0.000000e+00> : vector<1xf16>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x1x1xf16> to vector<1xf16>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<1> : vector<1xindex>
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<1xi1>
+    %0 = vector.step : vector<1xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+    %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x1x1xf16> to vector<1xf16>
+    %cst_2 = arith.constant dense<true> : vector<1xi1>
+    %cst_3 = arith.constant dense<1> : vector<1xindex>
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(

>From 281b2c13a08b69c3325aaa0a84ff6beabc950866 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Mar 2026 23:57:43 +0000
Subject: [PATCH 6/9] add inst_data reduction test; fix signed/unsigned compare

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp       |  2 +-
 .../XeGPU/propagate-layout-inst-data.mlir      | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 917e88000b0c2..fa3ac8965f7cc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -490,7 +490,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
       // First pass: Match consumer's layout on non-reduction dimensions
       for (int i = 0; i < srcRank; i++) {
         if (!llvm::is_contained(reductionDims, i) &&
-            consumerIdx < consumerSgLayout.size()) {
+            consumerIdx < static_cast<int>(consumerSgLayout.size())) {
           sgLayout[i] = consumerSgLayout[consumerIdx];
           assert((srcShape[i] % sgLayout[i] == 0) &&
                  "source shape not divisible by consumer sg_layout");
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 70e7010ce2367..5a1a43e248f91 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -315,6 +315,24 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_4(
+// CHECK: %[[ReduceVal:.*]] = vector.multi_reduction <add>, %[[Val:.*]], %[[CST:.*]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2] : vector<1x16x4xf16> to vector<1xf16>
+func.func @vector_2d_reduction_with_fractional_subgroup_size_4(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<64xi1>
+    %0 = vector.step : vector<64xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<64xindex>, vector<64xi1> -> vector<64xf16>
+    %2 = vector.shape_cast %1 : vector<64xf16> to vector<1x16x4xf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x16x4xf16> to vector<1xf16>
+    %cst_2 = arith.constant dense<true> : vector<1xi1>
+    %cst_3 = arith.constant dense<1> : vector<1xindex>
+    xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [1, 2]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(

>From e77ca493343a7f7e248340993d65d458ad1ef6a3 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Mar 2026 00:07:05 +0000
Subject: [PATCH 7/9] rename inst_data reduction test; add 1x4 lane-layout test

---
 .../XeGPU/propagate-layout-inst-data.mlir     |  4 +--
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 26 +++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 5a1a43e248f91..5a95185c8de48 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -317,9 +317,9 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
 
 // -----
 gpu.module @test {
-// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_4(
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4x1(
 // CHECK: %[[ReduceVal:.*]] = vector.multi_reduction <add>, %[[Val:.*]], %[[CST:.*]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2] : vector<1x16x4xf16> to vector<1xf16>
-func.func @vector_2d_reduction_with_fractional_subgroup_size_4(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4x1(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
     %cst = arith.constant dense<true> : vector<64xi1>
     %0 = vector.step : vector<64xindex>
     %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<64xindex>, vector<64xi1> -> vector<64xf16>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 19bd7037ed71d..182437f2ff649 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -715,6 +715,32 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [4], lane_data = [1]>} dense<true> : vector<4xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [4], lane_data = [1]>} : vector<4xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [4], lane_data = [1]>}> : memref<1024xf16>, vector<4xindex>, vector<4xi1> -> vector<4xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>} : vector<4xf16> to vector<1x4xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>, dims = [1, 2]>} dense<0.000000e+00> : vector<1xf16>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x4xf16> to vector<1xf16>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<1> : vector<1xindex>
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<4xi1>
+    %0 = vector.step : vector<4xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<4xindex>, vector<4xi1> -> vector<4xf16>
+    %2 = vector.shape_cast %1 : vector<4xf16> to vector<1x4xf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x4xf16> to vector<1xf16>
+    %cst_2 = arith.constant dense<true> : vector<1xi1>
+    %cst_3 = arith.constant dense<1> : vector<1xindex>
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(

>From a0be560c09e924aa2e08cab717ebd836a3917ded Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Mar 2026 05:50:35 +0000
Subject: [PATCH 8/9] fix SliceAttr::getNumSubgroups; drop order from lane layout

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td      | 4 +---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 6 ++----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index d16787297b302..8561226af47f6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -616,9 +616,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
     }
 
     int64_t getNumSubgroups() const {
-      SliceAttr attr = flatten();
-      auto parent = dyn_cast<LayoutAttr>(attr.getParent());
-      return parent.getRank();
+      return getParent().getNumSubgroups();
     }
 
     DenseI32ArrayAttr getOrder() const {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index fa3ac8965f7cc..e326fe2316702 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -538,10 +538,8 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
         std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     laneData[srcRank - 2] =
         std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
-    srcLayout = xegpu::LayoutAttr::get(
-        context, toInt32Attr(laneLayout), toInt32Attr(laneData),
-        (!orderAttr || orderAttr.empty()) ? nullptr
-                                          : toInt32Attr(consumerOrder));
+    srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
+                                       toInt32Attr(laneData));
   }
 
   return xegpu::SliceAttr::get(context, srcLayout,

>From 1b031f261f90add6c7d22d94ec5edc49593cfb68 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Mar 2026 14:33:26 +0000
Subject: [PATCH 9/9] add tests

---
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 182437f2ff649..fee13b8d3e128 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -741,6 +741,36 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(%arg0: memref<1
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK: func.func @vector_reduction_broadcast_transpose(%[[ARG0:.*]]: memref<1024x64xf32>, %[[ARG1:.*]]: memref<1024x64xf32>)
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0xFF800000> : vector<1x16xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>} dense<0.000000e+00> : vector<8xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST_0]], %[[CST]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
+// CHECK: %[[INS:.*]] = vector.insert_strided_slice %[[RED]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>, offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
+// CHECK: %[[ADD:.*]] = arith.addf %[[INS]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>} : vector<8xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[ADD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>} : vector<8xf32> to vector<16x8xf32>
+// CHECK: %[[TR:.*]] = vector.transpose %[[BC]], [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x8xf32> to vector<8x16xf32>
+// CHECK: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x64xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[TR]], %[[DESC]][%[[C0]], %[[C0]]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @vector_reduction_broadcast_transpose(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+    %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+    %cst_0 = arith.constant dense<0xFF800000> : vector<1x16xf32>
+    %cst_1 = arith.constant dense<0.000000e+00> : vector<8xf32>
+    %c0 = arith.constant 0 : index
+    %100 = vector.multi_reduction <add>, %cst_0, %cst [1] : vector<1x16xf32> to vector<1xf32>
+    %157 = vector.insert_strided_slice %100, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
+    %165 = arith.addf %157, %cst_1 : vector<8xf32>
+    %166 = vector.broadcast %165 : vector<8xf32> to vector<16x8xf32>
+    %168 = vector.transpose %166, [1, 0] : vector<16x8xf32> to vector<8x16xf32>
+    %172 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+    xegpu.store_nd %168, %172[%c0, %c0]  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(