[Mlir-commits] [mlir] [MLIR][XeGPU] Enable one-step subgroup distribution of cross-lane reduction to shuffle op (PR #182698)

Tue Feb 24 14:51:00 PST 2026

https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/182698

>From 73a49452cbb9172d05a26845f540e78a28a803ff Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Feb 2026 18:39:47 +0000
Subject: [PATCH 01/14] change subgroup distribution of reduction cross lane to
 use shuffle directly

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   7 +
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  77 ++++++++---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 122 ++++++++++++++++--
 3 files changed, 176 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index ebf50c4cd57de..91b7c2202e56b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,6 +147,13 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
                               vector::CombiningKind kind, int64_t reductionDim,
                               Location loc, PatternRewriter &rewriter);
 
+Value lowerToVectorReductionsCrossLane(TypedValue<VectorType> src,
+                                       TypedValue<VectorType> acc,
+                                       vector::CombiningKind kind,
+                                       int64_t reductionDim,
+                                       int64_t reductionSize, Location loc,
+                                       PatternRewriter &rewriter);
+
 /// Helper Function to find a proper instruction multiple for the user-supplied
 /// sg-level data shape (diven by `dim`). `candidates` are uArch allowed shapes.
 /// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 99c2da386fab6..d82766f61338a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1337,6 +1337,21 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
     xegpu::DistributeLayoutAttr sourceLayout =
         xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
 
+    // Record the size of the reduction dimension before computing the
+    // distributed source type; the cross-lane lowering below needs it.
+    SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
+                                        sourceType.getShape().end());
+    int64_t reductionDimSize = sourceShape[reductionDim];
+    // sourceShape[reductionDim] = 1;
+    // VectorType modifiedSourceType =
+    //     VectorType::get(sourceShape, sourceType.getElementType());
+
+    // print out modifiedSourceType and sourceLayout for debugging
+    //  LLVM_DEBUG({
+    //    llvm::dbgs() << "modifiedSourceType: " << modifiedSourceType << "\n";
+    //    llvm::dbgs() << "sourceLayout: " << sourceLayout << "\n";
+    //  });
+
     FailureOr<VectorType> sourceDistTypeOrFailure =
         getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
     if (failed(sourceDistTypeOrFailure))
@@ -1372,6 +1387,17 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
 
     bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                 (sourceDistDim == 1 && reductionDim == 0);
+    // Print the values that drive the distribution decision for debugging.
+    LLVM_DEBUG({
+      llvm::dbgs() << "sourceDistDim: " << sourceDistDim << "\n";
+      llvm::dbgs() << "reductionDim: " << reductionDim << "\n";
+      llvm::dbgs() << "isReductionLaneLocal: " << isReductionLaneLocal << "\n";
+      llvm::dbgs() << "resultDistributed: " << resultDistributed << "\n";
+      llvm::dbgs() << "sourceDistType: " << sourceDistType << "\n";
+      llvm::dbgs() << "distributedResultType: " << distributedResultType
+                   << "\n";
+    });
+
     if (isReductionLaneLocal && !resultDistributed)
       return rewriter.notifyMatchFailure(
           warpOp, "Expecting a distributed result for lane-local reduction.");
@@ -1381,33 +1407,46 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
           warpOp,
           "Expecting a broadcasted result for non-lane-local reduction.");
 
+    // Yield the source and acc vectors from the WarpOp.
+    SmallVector<size_t> newRetIndices;
+    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
+        {sourceDistType, distributedResultType}, newRetIndices);
+    rewriter.setInsertionPointAfter(newWarpOp);
+
+    Value result;
     // Handle lane-local reduction case. In this case we fully distribute the
     // reduction result.
     if (isReductionLaneLocal) {
-      // Yield the source and acc vectors from the WarpOp.
-      SmallVector<size_t> newRetIndices;
-      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
-          {sourceDistType, distributedResultType}, newRetIndices);
-      rewriter.setInsertionPointAfter(newWarpOp);
-      Value result = xegpu::lowerToVectorReductions(
+
+      result = xegpu::lowerToVectorReductions(
           cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
           cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
           reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
-      // Replace the warp op result with the final result.
-      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
-      return success();
+      // print the reduction op for debugging
+      LLVM_DEBUG({
+        llvm::dbgs() << "reductionOp1: " << *reductionOp << "\n";
+        llvm::dbgs() << "lowered reduction result1: " << result << "\n";
+      });
+
+    } else {
+      // For the non-lane-local case, the reduction spans lanes, so lower it
+      // directly to cross-lane reductions on the distributed source and acc
+      // values yielded by the new warp op.
+      // rewriter.setInsertionPointAfter(reductionOp);
+      result = xegpu::lowerToVectorReductionsCrossLane(
+          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
+          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
+          reductionOp.getKind(), reductionDim, reductionDimSize,
+          reductionOp.getLoc(), rewriter);
+      // print the reduction op for debugging
+      LLVM_DEBUG({
+        llvm::dbgs() << "reductionOp2: " << *reductionOp << "\n";
+        llvm::dbgs() << "lowered reduction result2: " << result << "\n";
+      });
     }
-    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
-    // of multiple ReductionOps. Actual distribution is done by the
-    // WarpOpReduction pattern.
-    rewriter.setInsertionPointAfter(reductionOp);
-    Value result = xegpu::lowerToVectorReductions(
-        cast<TypedValue<VectorType>>(reductionOp.getSource()),
-        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
-        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
     // Replace the warp op result with the final result.
-    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
+    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 5fdab1e759deb..301b779c955d2 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -121,13 +121,36 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
   // dimensions are not distributed.
   unsigned distributionStart =
       originalType.getRank() - effectiveLaneLayout.size();
+
+  // Print original shape and lane layout for debugging
+  std::string shapeStr = "[";
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i > 0)
+      shapeStr += ", ";
+    shapeStr += std::to_string(dim);
+  }
+  shapeStr += "]";
+  LDBG() << "original shape: " << shapeStr;
+
+  std::string layoutStr = "[";
+  for (auto [i, dim] : llvm::enumerate(effectiveLaneLayout)) {
+    if (i > 0)
+      layoutStr += ", ";
+    layoutStr += std::to_string(dim);
+  }
+  layoutStr += "]";
+  LDBG() << "effective lane layout: " << layoutStr;
+
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
     if (i < distributionStart)
       continue;
     // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-      return failure();
-    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+    if (dim % effectiveLaneLayout[i - distributionStart] != 0) {
+      assert( effectiveLaneLayout[i - distributionStart] % dim == 0 &&
+              "The dimension size must be able evenly distributed to all lanes in round-robin manner.");
+      distributedShape[i] = 1;
+    } else
+      distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
   }
   return VectorType::get(distributedShape, originalType.getElementType());
 }
@@ -682,10 +705,10 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
-  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
-  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
-  // Reduction result should have the same layout as the accumulator.
-  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+  // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+  // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+  // // Reduction result should have the same layout as the accumulator.
+  // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
   for (int i = 0; i < nSlices; ++i) {
@@ -702,7 +725,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, {1, 1});
     // Extract strided slice has the same layout as src.
-    xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
+    // xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
 
     int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
 
@@ -713,8 +736,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
 
     // Shape cast output has the same layout as the accumulator. Shape cast
     // source has the same layout as the original reduction source.
-    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
-    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
+    // xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+    // xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
     // Extract and reduction results in scalars, so no result layout is needed.
     Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
     Value reduction = vector::ReductionOp::create(
@@ -722,7 +745,84 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
     reductionResult =
         vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
     // Insert op should have the same layout as the accumulator.
-    xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+    // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+  }
+  return reductionResult;
+}
+
+Value xegpu::lowerToVectorReductionsCrossLane(
+    TypedValue<VectorType> src, TypedValue<VectorType> acc,
+    vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
+    Location loc, PatternRewriter &rewriter) {
+  // Expecting a 2D source vector.
+  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
+  VectorType sourceType = src.getType();
+  int64_t sourceH = sourceType.getShape()[0];
+  int64_t sourceW = sourceType.getShape()[1];
+  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+  // Create a constant vector to hold the result of the reduction.
+  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
+  Value reductionResult = arith::ConstantOp::create(
+      rewriter, loc, acc.getType(),
+      DenseElementsAttr::get(acc.getType(), zeroAttr));
+  // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+  // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+  // // Reduction result should have the same layout as the accumulator.
+  // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+
+  // print source shape, reduction dim and reduction size for debugging
+  std::string shapeStr = "[";
+  for (auto [i, dim] : llvm::enumerate(sourceType.getShape())) {
+    if (i > 0)
+      shapeStr += ", ";
+    shapeStr += std::to_string(dim);
+  }
+  shapeStr += "]";
+  LDBG() << "source shape: " << shapeStr;
+  LDBG() << "reduction dim: " << reductionDim;
+  LDBG() << "reduction size: " << reductionSize;
+
+  // For each slice of the source, extract the slice vector, reduce it across
+  // lanes, and insert the reduced value into the result vector.
+  for (int i = 0; i < nSlices; ++i) {
+    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
+    if (reductionDim == 1) {
+      sliceOffsets = {i, 0};
+      sliceSizes = {1, sourceW};
+    } else {
+      sliceOffsets = {0, i};
+      sliceSizes = {sourceH, 1};
+    }
+
+    // print src, sliceOffsets, sliceSizes for debugging
+    LDBG() << "src: " << src;
+    LDBG() << "sliceOffsets: [" << sliceOffsets[0] << ", " << sliceOffsets[1]
+           << "]";
+    LDBG() << "sliceSizes: [" << sliceSizes[0] << ", " << sliceSizes[1] << "]";
+
+    vector::ExtractStridedSliceOp extractOp =
+        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
+                                              sliceSizes, {1, 1});
+
+    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
+
+    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
+        rewriter, loc,
+        VectorType::get({nSliceElements}, sourceType.getElementType()),
+        extractOp.getResult());
+
+    // Extract and reduction both yield scalars, so no result layout is needed.
+    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
+
+    // Distribute and reduce across work-items in the subgroup.
+    Value fullReduce =
+        xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
+
+    fullReduce =
+        vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
+
+    reductionResult =
+        vector::InsertOp::create(rewriter, loc, fullReduce, reductionResult, i);
   }
   return reductionResult;
 }

>From 0b5be3187723bed4ea2513851459d6ee86d55620 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Feb 2026 23:57:56 +0000
Subject: [PATCH 02/14] add temporary layout back

---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 301b779c955d2..e0a1de1af45aa 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -705,10 +705,10 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
-  // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
-  // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
-  // // Reduction result should have the same layout as the accumulator.
-  // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+  // Reduction result should have the same layout as the accumulator.
+  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
   for (int i = 0; i < nSlices; ++i) {
@@ -725,7 +725,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, {1, 1});
     // Extract strided slice has the same layout as src.
-    // xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
+    xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
 
     int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
 
@@ -736,8 +736,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
 
     // Shape cast output has the same layout as the accumulator. Shape cast
     // source has the same layout as the original reduction source.
-    // xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
-    // xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
+    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
     // Extract and reduction results in scalars, so no result layout is needed.
     Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
     Value reduction = vector::ReductionOp::create(
@@ -745,7 +745,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
     reductionResult =
         vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
     // Insert op should have the same layout as the accumulator.
-    // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+    xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
   }
   return reductionResult;
 }

>From d22fdd2cfec585c6628aa318a80c8f4d8b19d402 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 01:31:32 +0000
Subject: [PATCH 03/14] modifying tests

---
 .../XeGPU/subgroup-distribute-unit.mlir       | 74 ++++++++++++-------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index fb23f38b44b46..60fe29b7c9338 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -283,14 +283,30 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
   gpu.return
 }
 
-
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK:      %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK-NEXT:   %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
-// CHECK-NEXT:   %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:   %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT:   %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:   %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
+// CHECK:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK:   %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK:   %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
+// CHECK:   %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[T2]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
+// CHECK:   %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
+// CHECK:   %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK:   %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK:   %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[T1]] : f32
+// CHECK:   %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK:   %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
+// CHECK:   %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle  xor %[[T9]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
+// CHECK:   %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
+// CHECK:   %[[SHUFFLE7:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUFFLE7]] : f32
+// CHECK:   %[[SHUFFLE8:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUFFLE8]] : f32
+// CHECK:   %[[FINAL2:.*]] = arith.addf %[[ADD8]], %[[T8]] : f32
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -352,27 +368,29 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
 
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:       %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK:       %[[SRC:.*]] = "some_def"()
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-// CHECK-SAME:    : () -> vector<16x2xf32>
-// CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]]
-// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
-// CHECK:       %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T5:.*]] = vector.shape_cast %[[T4]]
-// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
+// CHECK:       %[[V0:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK:       %[[V1:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<1x2xf32>
+// CHECK:       %[[SHUF1:.*]], %[[VALID1:.*]] = gpu.shuffle  xor %[[V1]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD1:.*]] = arith.addf %[[V1]], %[[SHUF1]] : f32
+// CHECK:       %[[SHUF2:.*]], %[[VALID2:.*]] = gpu.shuffle  xor %[[ADD1]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUF2]] : f32
+// CHECK:       %[[SHUF3:.*]], %[[VALID3:.*]] = gpu.shuffle  xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUF3]] : f32
+// CHECK:       %[[SHUF4:.*]], %[[VALID4:.*]] = gpu.shuffle  xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUF4]] : f32
+// CHECK:       %[[RES0:.*]] = arith.addf %[[ADD4]], %[[V0]] : f32
+// CHECK:       %[[V2:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK:       %[[V3:.*]] = vector.extract %{{.*}}#1[0, 1] : f32 from vector<1x2xf32>
+// CHECK:       %[[SHUF5:.*]], %[[VALID5:.*]] = gpu.shuffle  xor %[[V3]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD5:.*]] = arith.addf %[[V3]], %[[SHUF5]] : f32
+// CHECK:       %[[SHUF6:.*]], %[[VALID6:.*]] = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUF6]] : f32
+// CHECK:       %[[SHUF7:.*]], %[[VALID7:.*]] = gpu.shuffle  xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUF7]] : f32
+// CHECK:       %[[SHUF8:.*]], %[[VALID8:.*]] = gpu.shuffle  xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
+// CHECK:       %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUF8]] : f32
+// CHECK:       %[[RES1:.*]] = arith.addf %[[ADD8]], %[[V2]] : f32
+// CHECK:       %[[RESULT:.*]] = vector.from_elements %[[RES0]], %[[RES1]] : vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {

>From 89ca00db91d280fa396da7c37b42601517e8b975 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 02:50:36 +0000
Subject: [PATCH 04/14] remove empty lines

---
 mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 60fe29b7c9338..f8d71350b7c52 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,8 +329,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
-
-
 // CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
 // CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
 // CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {

>From ad00de29fd88e8df3952c5da7936963eb4869af5 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 03:27:08 +0000
Subject: [PATCH 05/14] adding tests

---
 .../XeGPU/subgroup-distribute-unit.mlir       | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f8d71350b7c52..1cea44af57459 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,6 +329,44 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size
+// CHECK:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK:   %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK:   %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
+// CHECK:   %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[T2]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
+// CHECK:   %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
+// CHECK:   %[[FINAL:.*]] = arith.addf %[[ADD2]], %[[T1]] : f32
+// CHECK:   %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK:   %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
+// CHECK:   %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle  xor %[[T9]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
+// CHECK:   %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK:   %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
+// CHECK:   %[[FINAL2:.*]] = arith.addf %[[ADD6]], %[[T8]] : f32
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size(%laneid: index) {
+  %c0 = arith.constant 0 : index
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+    %src = "some_def"()
+      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : () -> (vector<16x4xf32>)
+    %acc = arith.constant
+      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+      dense<0.0>  : vector<2xf32>
+    %1 = vector.multi_reduction <add>, %src, %acc
+      {
+        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
+        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+      }
+      [1] : vector<16x4xf32> to vector<2xf32>
+    gpu.yield %1 : vector<2xf32>
+  }
+  "some_user_op"(%r) : (vector<2xf32>) -> ()
+  gpu.return
+}
+
 // CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
 // CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
 // CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {

>From a25ff3504aa4b3affb90f7ecdf923ee81901cd5a Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 04:50:08 +0000
Subject: [PATCH 06/14] adding support to new subgroup distribution mechanism

---
 .../XeGPUSgToWiDistributeExperimental.cpp     |  88 ++++++---
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 174 ++++++++++--------
 .../XeGPU/subgroup-distribute-unit.mlir       |  38 ----
 3 files changed, 168 insertions(+), 132 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..4237507c7f016 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -469,22 +469,48 @@ struct SgToWiMultiDimReduction
   LogicalResult
   matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    Value result;
     // Only lane-local reduction is handled here.
-    if (!isReductionLaneLocal(op))
-      return rewriter.notifyMatchFailure(
-          op, "Only lane-local reduction is supported, expected reduction "
-              "dimension to be "
-              "not distributed.");
-    auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
-    VectorType resVecTy = dyn_cast<VectorType>(op.getType());
-    auto resDistVecTyOrFailure =
-        getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
-    // Simply create a new MultiDimReductionOp using adaptor operands and the
-    // new result type.
-    auto newOp = vector::MultiDimReductionOp::create(
-        rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
-        adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
-    rewriter.replaceOp(op, newOp.getResult());
+    if (isReductionLaneLocal(op)) {
+      auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+      VectorType resVecTy = dyn_cast<VectorType>(op.getType());
+      auto resDistVecTyOrFailure =
+          getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
+      // Simply create a new MultiDimReductionOp using adaptor operands and the
+      // new result type.
+      result = vector::MultiDimReductionOp::create(
+          rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
+          adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
+    } else {
+      ArrayRef<int64_t> reductionDims = op.getReductionDims();
+      assert(reductionDims.size() == 1 &&
+             "Expecting single reduction dimension for subgroup multi "
+             "reduction op");
+      // print adaptor.getSource() and adaptor.getAcc() for debugging
+      LLVM_DEBUG({
+        llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
+        llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
+      });
+      // Read the size of the reduction dimension from the op's original
+      // source vector type; it is forwarded to the cross-lane lowering below
+      // as the reduction size.
+      auto reductionDim = reductionDims[0];
+      VectorType sourceType = op.getSourceVectorType();
+      SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
+                                          sourceType.getShape().end());
+      int64_t reductionDimSize = sourceShape[reductionDim];
+      result = xegpu::lowerToVectorReductionsCrossLane(
+          cast<TypedValue<VectorType>>(adaptor.getSource()),
+          cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+          reductionDim, reductionDimSize, op.getLoc(), rewriter);
+      // print the reduction op for debugging
+      LLVM_DEBUG({
+        llvm::dbgs() << "reductionOp3: " << *op << "\n";
+        llvm::dbgs() << "lowered reduction result3: " << result << "\n";
+      });
+    }
+
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
@@ -511,11 +537,31 @@ struct LowerVectorMultiReductionPattern
         reductionDims.size() == 1 &&
         "Expecting single reduction dimension for subgroup multi reduction op");
 
-    // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
-    Value result = xegpu::lowerToVectorReductions(
-        cast<TypedValue<VectorType>>(op.getSource()),
-        cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
-        reductionDims[0], op.getLoc(), rewriter);
+    // // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
+    // Value result = xegpu::lowerToVectorReductions(
+    //     cast<TypedValue<VectorType>>(op.getSource()),
+    //     cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
+    //     reductionDims[0], op.getLoc(), rewriter);
+
+    // For non-lane-local case, we simply rewrite the MultiReductionOp in
+    // terms of multiple ReductionOps. Actual distribution is done by the
+    // WarpOpReduction pattern.
+    // rewriter.setInsertionPointAfter(reductionOp);
+
+    // print adaptor.getSource() and adaptor.getAcc() for debugging
+    LLVM_DEBUG({
+      llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
+      llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
+    });
+    Value result = xegpu::lowerToVectorReductionsCrossLane(
+        cast<TypedValue<VectorType>>(adaptor.getSource()),
+        cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+        reductionDims[0], /*reductionDimSize=*/1, op.getLoc(), rewriter);
+    // print the reduction op for debugging
+    LLVM_DEBUG({
+      llvm::dbgs() << "reductionOp3: " << *op << "\n";
+      llvm::dbgs() << "lowered reduction result3: " << result << "\n";
+    });
 
     rewriter.replaceOp(op, result);
     return success();
@@ -725,7 +771,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         if (!isValidSubgroupMultiReductionOp(op))
           return true;
         // Lane local reductions are illegal at this point and must be lowered.
-        return !isReductionLaneLocal(op);
+        return false; // !isReductionLaneLocal(op);
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..abf3ddcc3d373 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,12 +2,6 @@
 // RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
 // RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
 
-// RUN: mlir-opt --allow-unregistered-dialect \
-// RUN: --test-xegpu-sg-to-wi-distribute-experimental="enable-rewrite-multi-reduction-to-reductions"  \
-// RUN: --split-input-file  %s | FileCheck --check-prefix=CHECK-REWRITE %s
-
-
-
 gpu.module @xevm_module {
 // CHECK-LABEL: gpu.func @create_nd_tdesc
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
@@ -181,43 +175,56 @@ gpu.func @vector_reduction() {
   gpu.return
 }
 
-
-// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK-REWRITE-DAG:     %[[SRC:.*]] = "some_def"() {layout_result_0 =
-// CHECK-REWRITE-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32>
-// CHECK-REWRITE-DAG:     %[[ACC:.*]] = arith.constant
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE-DAG:     %[[ZERO:.*]] = arith.constant
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE:         %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME:       offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
-// CHECK-REWRITE-NEXT:    %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
-// CHECK-REWRITE-SAME:      {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      : vector<1x16xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT:    %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT:    %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      : f32 into vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME:       offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
-// CHECK-REWRITE-NEXT:    %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
-// CHECK-REWRITE-SAME:      {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      : vector<1x16xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT:    %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT:    %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME:      : f32 into vector<2xf32>
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
+// CHECK: %c16_i32 = arith.constant 16 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c16_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c16_i32_2 = arith.constant 16 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c16_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %c16_i32_5 = arith.constant 16 : i32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle  xor %5, %c4_i32, %c16_i32_5 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
+// CHECK: %c16_i32_8 = arith.constant 16 : i32
+// CHECK: %c8_i32 = arith.constant 8 : i32
+// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle  xor %6, %c8_i32, %c16_i32_8 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
+// CHECK: %8 = arith.addf %7, %2 : f32
+// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
+// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
+// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
+// CHECK: %c16_i32_11 = arith.constant 16 : i32
+// CHECK: %c1_i32_12 = arith.constant 1 : i32
+// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle  xor %13, %c1_i32_12, %c16_i32_11 : f32
+// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
+// CHECK: %c16_i32_15 = arith.constant 16 : i32
+// CHECK: %c2_i32_16 = arith.constant 2 : i32
+// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle  xor %14, %c2_i32_16, %c16_i32_15 : f32
+// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
+// CHECK: %c16_i32_19 = arith.constant 16 : i32
+// CHECK: %c4_i32_20 = arith.constant 4 : i32
+// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle  xor %15, %c4_i32_20, %c16_i32_19 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
+// CHECK: %c16_i32_23 = arith.constant 16 : i32
+// CHECK: %c8_i32_24 = arith.constant 8 : i32
+// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle  xor %16, %c8_i32_24, %c16_i32_23 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
+// CHECK: %18 = arith.addf %17, %12 : f32
+// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-    %src = "some_def"()
+  %src = arith.constant
       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<2x16xf32>)
+      dense<0.0>  : vector<2x16xf32>
     %acc = arith.constant
       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
       dense<0.0>  : vector<2xf32>
@@ -229,42 +236,63 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
-// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK-REWRITE-DAG:     %[[SRC:.*]] = "some_def"() {layout_result_0 =
-// CHECK-REWRITE-SAME:      #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32>
-// CHECK-REWRITE-DAG:     %[[ACC:.*]] = arith.constant
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME:      dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE-DAG:     %[[ZERO:.*]] = arith.constant
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME:      dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE:         %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME:       offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REWRITE-NEXT:    %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
-// CHECK-REWRITE-SAME:      {{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME:      : vector<16x1xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT:    %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT:    %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME:      : f32 into vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME:       offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REWRITE-NEXT:    %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
-// CHECK-REWRITE-SAME:      {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
-// CHECK-REWRITE-SAME:      : vector<16x1xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT:    %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT:    %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT:    %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
-// CHECK-REWRITE-SAME:      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME:      : f32 into vector<2xf32>
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x1xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<1xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c4_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c4_i32_2 = arith.constant 4 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c4_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %6 = arith.addf %5, %2 : f32
+// CHECK: %7 = vector.insert %6, %cst_1 [0] : f32 into vector<1xf32>
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size(%laneid: index) {
+  %c0 = arith.constant 0 : index
+  %src = arith.constant
+      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      dense<0.0>  : vector<1x4xf32>
+    %acc = arith.constant
+      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+      dense<0.0>  : vector<1xf32>
+    %1 = vector.multi_reduction <add>, %src, %acc
+      {
+        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+      }
+      [1] : vector<1x4xf32> to vector<1xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
+// CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
+// CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %5, %c1_i32, %c2_i32 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult : f32
+// CHECK: %7 = arith.addf %6, %4 : f32
+// CHECK: %8 = vector.insert %7, %cst_0 [0] : f32 into vector<2xf32>
+// CHECK: %9 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %10 = vector.shape_cast %9 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %11 = vector.extract %cst[1] : f32 from vector<2xf32>
+// CHECK: %12 = vector.reduction <add>, %10 : vector<1xf32> into f32
+// CHECK: %c2_i32_1 = arith.constant 2 : i32
+// CHECK: %c1_i32_2 = arith.constant 1 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %12, %c1_i32_2, %c2_i32_1 : f32
+// CHECK: %13 = arith.addf %12, %shuffleResult_3 : f32
+// CHECK: %14 = arith.addf %13, %11 : f32
+// CHECK: %15 = vector.insert %14, %8 [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
     %src = "some_def"()
       {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      : () -> (vector<16x2xf32>)
+      : () -> (vector<2x2xf32>)
     %acc = arith.constant
       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
       dense<0.0>  : vector<2xf32>
@@ -272,7 +300,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
       {
         layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
       }
-      [0] : vector<16x2xf32> to vector<2xf32>
+      [0] : vector<2x2xf32> to vector<2xf32>
   gpu.return
 }
 
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 1cea44af57459..f8d71350b7c52 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,44 +329,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size
-// CHECK:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK:   %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK:   %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
-// CHECK:   %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[T2]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
-// CHECK:   %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
-// CHECK:   %[[FINAL:.*]] = arith.addf %[[ADD2]], %[[T1]] : f32
-// CHECK:   %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK:   %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
-// CHECK:   %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle  xor %[[T9]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
-// CHECK:   %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
-// CHECK:   %[[FINAL2:.*]] = arith.addf %[[ADD6]], %[[T8]] : f32
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<16x4xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<2xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
-      }
-      [1] : vector<16x4xf32> to vector<2xf32>
-    gpu.yield %1 : vector<2xf32>
-  }
-  "some_user_op"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
 // CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
 // CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
 // CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {

>From 6b5ebccda034291b46260ac664dcf2095e872ebf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 05:05:49 +0000
Subject: [PATCH 07/14] remove XeGPUSgToWiLowerVectorMultiReduction pattern

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |  8 ---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 72 -------------------
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 18 -----
 3 files changed, 98 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 6f6d58d4ab605..6afd9c9d09369 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -82,14 +82,6 @@ void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
 void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target);
-/// Appends patterns to rewrite vector::MultiDimReductionOp in terms of
-/// vector::ReductionOps if the multi-reduction involves cross-lane data
-/// movement. This pattern is used as pre-processing step before applying
-/// subgroup to workitem distribution patterns. This pattern will rewrite a
-/// multi reduction in terms of a series of simpler extract, reduction and
-/// insert ops if the reduction require cross-lane data movement.
-void populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
-    RewritePatternSet &patterns, ConversionTarget &target);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
 /// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 4237507c7f016..b73459b0587b1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -515,59 +515,6 @@ struct SgToWiMultiDimReduction
   }
 };
 
-/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
-/// of vector.extract_strided_slice, vector.reduction and
-/// vector.insert_strided_slice ops. This is used when the reduction dimension
-/// is distributed to lanes and a naive (lane-local) distribution is not
-/// possible. Then later on, these partially lowered subgroup-level ops are
-/// further lowered to workitem-level by respective patterns.
-struct LowerVectorMultiReductionPattern
-    : public OpConversionPattern<vector::MultiDimReductionOp> {
-  using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
-
-  LogicalResult
-  matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    // Only non-lane-local reduction is handled here.
-    if (isReductionLaneLocal(op))
-      return rewriter.notifyMatchFailure(
-          op, "Reduction is lane-local, it does not require rewrite.");
-    ArrayRef<int64_t> reductionDims = op.getReductionDims();
-    assert(
-        reductionDims.size() == 1 &&
-        "Expecting single reduction dimension for subgroup multi reduction op");
-
-    // // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
-    // Value result = xegpu::lowerToVectorReductions(
-    //     cast<TypedValue<VectorType>>(op.getSource()),
-    //     cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
-    //     reductionDims[0], op.getLoc(), rewriter);
-
-    // For non-lane-local case, we simply rewrite the MultiReductionOp in
-    // terms of multiple ReductionOps. Actual distribution is done by the
-    // WarpOpReduction pattern.
-    // rewriter.setInsertionPointAfter(reductionOp);
-
-    // print adaptor.getSource() and adaptor.getAcc() for debugging
-    LLVM_DEBUG({
-      llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
-      llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
-    });
-    Value result = xegpu::lowerToVectorReductionsCrossLane(
-        cast<TypedValue<VectorType>>(adaptor.getSource()),
-        cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
-        reductionDims[0], /*reductionDimSize=*/1, op.getLoc(), rewriter);
-    // print the reduction op for debugging
-    LLVM_DEBUG({
-      llvm::dbgs() << "reductionOp3: " << *op << "\n";
-      llvm::dbgs() << "lowered reduction result3: " << result << "\n";
-    });
-
-    rewriter.replaceOp(op, result);
-    return success();
-  }
-};
-
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -779,22 +726,3 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
                SgToWiVectorReduction, SgToWiMultiDimReduction>(
       typeConverter, patterns.getContext());
 }
-
-void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
-    RewritePatternSet &patterns, ConversionTarget &target) {
-  // vector::MultiDimReductionOp legality.
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [&](vector::MultiDimReductionOp op) {
-        // Check common conditions for subgroup multi reduction op.
-        if (!isValidSubgroupMultiReductionOp(op))
-          return true;
-        // Lane local reductions are legal. We only rewrite non-lane-local
-        // reductions.
-        return isReductionLaneLocal(op);
-      });
-  // vector::ReductionOp is legal.
-  target.addDynamicallyLegalOp<vector::ReductionOp>(
-      [&](vector::ReductionOp op) { return true; });
-  target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-  patterns.add<LowerVectorMultiReductionPattern>(patterns.getContext());
-}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 33af2c5b33d89..4192ac46764dd 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -273,12 +273,6 @@ struct TestXeGPUSgToWiDistributeExperimental
            "Work-item Distribution";
   }
 
-  Option<bool> enableRewriteMultiReductionToReductions{
-      *this, "enable-rewrite-multi-reduction-to-reductions",
-      llvm::cl::desc("Partially lower multi-reduction ops to reduction ops if "
-                     "the reduction dimension is distributed."),
-      llvm::cl::init(false)};
-
   void getDependentDialects(::mlir::DialectRegistry &registry) const override {
     registry.insert<arith::ArithDialect>();
     registry.insert<memref::MemRefDialect>();
@@ -306,18 +300,6 @@ struct TestXeGPUSgToWiDistributeExperimental
     typeConverter.addSourceMaterialization(materializeCast);
     typeConverter.addTargetMaterialization(materializeCast);
 
-    // If `enableRewriteMultiReductionToReductions` is set, only focus on
-    // testing the partial lowering of vector::MultiReductionOp.
-    if (enableRewriteMultiReductionToReductions) {
-      xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
-      ConversionTarget target(*ctx);
-      RewritePatternSet patterns(ctx);
-      xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(patterns,
-                                                                     target);
-      (void)applyPartialConversion(getOperation(), target, std::move(patterns));
-      return;
-    }
-
     ConversionTarget target(*ctx);
     RewritePatternSet patterns(ctx);
     xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(

>From 1b9165e61b0fad8dc9a7d325a6cb1c133ac30a49 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 05:14:15 +0000
Subject: [PATCH 08/14] remove the multi-reduction pattern change in old warp
 op based distribution mechanism

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 77 +++++--------------
 .../XeGPU/subgroup-distribute-unit.mlir       | 76 ++++++++----------
 2 files changed, 49 insertions(+), 104 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d82766f61338a..99c2da386fab6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1337,21 +1337,6 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
     xegpu::DistributeLayoutAttr sourceLayout =
         xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
 
-    // before get distribute vec type for source, first set its shape to be unit
-    // for the reduction dimension
-    SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
-                                        sourceType.getShape().end());
-    int64_t reductionDimSize = sourceShape[reductionDim];
-    // sourceShape[reductionDim] = 1;
-    // VectorType modifiedSourceType =
-    //     VectorType::get(sourceShape, sourceType.getElementType());
-
-    // print out modifiedSourceType and sourceLayout for debugging
-    //  LLVM_DEBUG({
-    //    llvm::dbgs() << "modifiedSourceType: " << modifiedSourceType << "\n";
-    //    llvm::dbgs() << "sourceLayout: " << sourceLayout << "\n";
-    //  });
-
     FailureOr<VectorType> sourceDistTypeOrFailure =
         getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
     if (failed(sourceDistTypeOrFailure))
@@ -1387,17 +1372,6 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
 
     bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                 (sourceDistDim == 1 && reductionDim == 0);
-    // print here all these five variables for debugging
-    LLVM_DEBUG({
-      llvm::dbgs() << "sourceDistDim: " << sourceDistDim << "\n";
-      llvm::dbgs() << "reductionDim: " << reductionDim << "\n";
-      llvm::dbgs() << "isReductionLaneLocal: " << isReductionLaneLocal << "\n";
-      llvm::dbgs() << "resultDistributed: " << resultDistributed << "\n";
-      llvm::dbgs() << "sourceDistType: " << sourceDistType << "\n";
-      llvm::dbgs() << "distributedResultType: " << distributedResultType
-                   << "\n";
-    });
-
     if (isReductionLaneLocal && !resultDistributed)
       return rewriter.notifyMatchFailure(
           warpOp, "Expecting a distributed result for lane-local reduction.");
@@ -1407,46 +1381,33 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
           warpOp,
           "Expecting a broadcasted result for non-lane-local reduction.");
 
-    // Yield the source and acc vectors from the WarpOp.
-    SmallVector<size_t> newRetIndices;
-    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
-        {sourceDistType, distributedResultType}, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    Value result;
     // Handle lane-local reduction case. In this case we fully distribute the
     // reduction result.
     if (isReductionLaneLocal) {
-
-      result = xegpu::lowerToVectorReductions(
+      // Yield the source and acc vectors from the WarpOp.
+      SmallVector<size_t> newRetIndices;
+      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
+          {sourceDistType, distributedResultType}, newRetIndices);
+      rewriter.setInsertionPointAfter(newWarpOp);
+      Value result = xegpu::lowerToVectorReductions(
           cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
           cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
           reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
-      // print the reduction op for debugging
-      LLVM_DEBUG({
-        llvm::dbgs() << "reductionOp1: " << *reductionOp << "\n";
-        llvm::dbgs() << "lowered reduction result1: " << result << "\n";
-      });
-
-    } else {
-      // For non-lane-local case, we simply rewrite the MultiReductionOp in
-      // terms of multiple ReductionOps. Actual distribution is done by the
-      // WarpOpReduction pattern.
-      // rewriter.setInsertionPointAfter(reductionOp);
-      result = xegpu::lowerToVectorReductionsCrossLane(
-          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
-          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
-          reductionOp.getKind(), reductionDim, reductionDimSize,
-          reductionOp.getLoc(), rewriter);
-      // print the reduction op for debugging
-      LLVM_DEBUG({
-        llvm::dbgs() << "reductionOp2: " << *reductionOp << "\n";
-        llvm::dbgs() << "lowered reduction result2: " << result << "\n";
-      });
+      // Replace the warp op result with the final result.
+      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
+      return success();
     }
+    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
+    // of multiple ReductionOps. Actual distribution is done by the
+    // WarpOpReduction pattern.
+    rewriter.setInsertionPointAfter(reductionOp);
+    Value result = xegpu::lowerToVectorReductions(
+        cast<TypedValue<VectorType>>(reductionOp.getSource()),
+        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
+        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
     // Replace the warp op result with the final result.
-    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
+    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
     return success();
   }
 };
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f8d71350b7c52..fb23f38b44b46 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -283,30 +283,14 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
   gpu.return
 }
 
+
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK:   %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK:   %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
-// CHECK:   %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[T2]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
-// CHECK:   %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
-// CHECK:   %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
-// CHECK:   %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
-// CHECK:   %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[T1]] : f32
-// CHECK:   %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK:   %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
-// CHECK:   %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle  xor %[[T9]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
-// CHECK:   %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
-// CHECK:   %[[SHUFFLE7:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUFFLE7]] : f32
-// CHECK:   %[[SHUFFLE8:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
-// CHECK:   %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUFFLE8]] : f32
-// CHECK:   %[[FINAL2:.*]] = arith.addf %[[ADD8]], %[[T8]] : f32
+// CHECK:      %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK-NEXT:   %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
+// CHECK-NEXT:   %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT:   %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
+// CHECK-NEXT:   %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT:   %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -329,6 +313,8 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
+
+
 // CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
 // CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
 // CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
@@ -366,29 +352,27 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
 
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK:       %[[V0:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK:       %[[V1:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<1x2xf32>
-// CHECK:       %[[SHUF1:.*]], %[[VALID1:.*]] = gpu.shuffle  xor %[[V1]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD1:.*]] = arith.addf %[[V1]], %[[SHUF1]] : f32
-// CHECK:       %[[SHUF2:.*]], %[[VALID2:.*]] = gpu.shuffle  xor %[[ADD1]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUF2]] : f32
-// CHECK:       %[[SHUF3:.*]], %[[VALID3:.*]] = gpu.shuffle  xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUF3]] : f32
-// CHECK:       %[[SHUF4:.*]], %[[VALID4:.*]] = gpu.shuffle  xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUF4]] : f32
-// CHECK:       %[[RES0:.*]] = arith.addf %[[ADD4]], %[[V0]] : f32
-// CHECK:       %[[V2:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK:       %[[V3:.*]] = vector.extract %{{.*}}#1[0, 1] : f32 from vector<1x2xf32>
-// CHECK:       %[[SHUF5:.*]], %[[VALID5:.*]] = gpu.shuffle  xor %[[V3]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD5:.*]] = arith.addf %[[V3]], %[[SHUF5]] : f32
-// CHECK:       %[[SHUF6:.*]], %[[VALID6:.*]] = gpu.shuffle  xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUF6]] : f32
-// CHECK:       %[[SHUF7:.*]], %[[VALID7:.*]] = gpu.shuffle  xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUF7]] : f32
-// CHECK:       %[[SHUF8:.*]], %[[VALID8:.*]] = gpu.shuffle  xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
-// CHECK:       %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUF8]] : f32
-// CHECK:       %[[RES1:.*]] = arith.addf %[[ADD8]], %[[V2]] : f32
-// CHECK:       %[[RESULT:.*]] = vector.from_elements %[[RES0]], %[[RES1]] : vector<2xf32>
+// CHECK:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:       %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK:       %[[SRC:.*]] = "some_def"()
+// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME:    : () -> vector<16x2xf32>
+// CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME:     offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]]
+// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
+// CHECK:       %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
+// CHECK:       %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME:     offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK:       %[[T5:.*]] = vector.shape_cast %[[T4]]
+// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
+// CHECK:       %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {

>From 6f9e5b5a429b228c3492fdd0a586120157d86bfe Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 18:39:31 +0000
Subject: [PATCH 09/14] polish and remove partial-sg-size reduction test

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     | 12 +--
 .../XeGPUSgToWiDistributeExperimental.cpp     | 11 +--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  2 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 87 +++++++++----------
 4 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 91b7c2202e56b..f2cbb198b2dc2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,12 +147,12 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
                               vector::CombiningKind kind, int64_t reductionDim,
                               Location loc, PatternRewriter &rewriter);
 
-Value lowerToVectorReductionsCrossLane(TypedValue<VectorType> src,
-                                       TypedValue<VectorType> acc,
-                                       vector::CombiningKind kind,
-                                       int64_t reductionDim,
-                                       int64_t reductionSize, Location loc,
-                                       PatternRewriter &rewriter);
+Value lowerCrossLaneReductionToShuffles(TypedValue<VectorType> src,
+                                        TypedValue<VectorType> acc,
+                                        vector::CombiningKind kind,
+                                        int64_t reductionDim,
+                                        int64_t reductionSize, Location loc,
+                                        PatternRewriter &rewriter);
 
 /// Helper Function to find a proper instruction multiple for the user-supplied
 /// sg-level data shape (diven by `dim`). `candidates` are uArch allowed shapes.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b73459b0587b1..d0dc2e8df927e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -470,6 +470,10 @@ struct SgToWiMultiDimReduction
   matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Value result;
+    ArrayRef<int64_t> reductionDims = op.getReductionDims();
+    assert(reductionDims.size() == 1 &&
+           "Expecting single reduction dimension for subgroup multi "
+           "reduction op");
     // Only lane-local reduction is handled here.
     if (isReductionLaneLocal(op)) {
       auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
@@ -482,10 +486,7 @@ struct SgToWiMultiDimReduction
           rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
           adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
     } else {
-      ArrayRef<int64_t> reductionDims = op.getReductionDims();
-      assert(reductionDims.size() == 1 &&
-             "Expecting single reduction dimension for subgroup multi "
-             "reduction op");
+
       // print adaptor.getSource() and adaptor.getAcc() for debugging
       LLVM_DEBUG({
         llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
@@ -499,7 +500,7 @@ struct SgToWiMultiDimReduction
       SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
                                           sourceType.getShape().end());
       int64_t reductionDimSize = sourceShape[reductionDim];
-      result = xegpu::lowerToVectorReductionsCrossLane(
+      result = xegpu::lowerCrossLaneReductionToShuffles(
           cast<TypedValue<VectorType>>(adaptor.getSource()),
           cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
           reductionDim, reductionDimSize, op.getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e0a1de1af45aa..a0986608bdba0 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -750,7 +750,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
   return reductionResult;
 }
 
-Value xegpu::lowerToVectorReductionsCrossLane(
+Value xegpu::lowerCrossLaneReductionToShuffles(
     TypedValue<VectorType> src, TypedValue<VectorType> acc,
     vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
     Location loc, PatternRewriter &rewriter) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index abf3ddcc3d373..029cca419fa28 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -236,63 +236,56 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x1xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<1xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c4_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c4_i32_2 = arith.constant 4 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c4_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %6 = arith.addf %5, %2 : f32
-// CHECK: %7 = vector.insert %6, %cst_1 [0] : f32 into vector<1xf32>
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %src = arith.constant
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      dense<0.0>  : vector<1x4xf32>
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<1xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
-      }
-      [1] : vector<1x4xf32> to vector<1xf32>
-  gpu.return
-}
-
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
 // CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
 // CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
 // CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
 // CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
-// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %c16_i32 = arith.constant 16 : i32
 // CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %5, %c1_i32, %c2_i32 : f32
+// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %5, %c1_i32, %c16_i32 : f32
 // CHECK: %6 = arith.addf %5, %shuffleResult : f32
-// CHECK: %7 = arith.addf %6, %4 : f32
-// CHECK: %8 = vector.insert %7, %cst_0 [0] : f32 into vector<2xf32>
-// CHECK: %9 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %10 = vector.shape_cast %9 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %11 = vector.extract %cst[1] : f32 from vector<2xf32>
-// CHECK: %12 = vector.reduction <add>, %10 : vector<1xf32> into f32
-// CHECK: %c2_i32_1 = arith.constant 2 : i32
-// CHECK: %c1_i32_2 = arith.constant 1 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %12, %c1_i32_2, %c2_i32_1 : f32
-// CHECK: %13 = arith.addf %12, %shuffleResult_3 : f32
-// CHECK: %14 = arith.addf %13, %11 : f32
-// CHECK: %15 = vector.insert %14, %8 [1] : f32 into vector<2xf32>
+// CHECK: %c16_i32_1 = arith.constant 16 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_2, %valid_3 = gpu.shuffle  xor %6, %c2_i32, %c16_i32_1 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_2 : f32
+// CHECK: %c16_i32_4 = arith.constant 16 : i32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %shuffleResult_5, %valid_6 = gpu.shuffle  xor %7, %c4_i32, %c16_i32_4 : f32
+// CHECK: %8 = arith.addf %7, %shuffleResult_5 : f32
+// CHECK: %c16_i32_7 = arith.constant 16 : i32
+// CHECK: %c8_i32 = arith.constant 8 : i32
+// CHECK: %shuffleResult_8, %valid_9 = gpu.shuffle  xor %8, %c8_i32, %c16_i32_7 : f32
+// CHECK: %9 = arith.addf %8, %shuffleResult_8 : f32
+// CHECK: %10 = arith.addf %9, %4 : f32
+// CHECK: %11 = vector.insert %10, %cst_0 [0] : f32 into vector<2xf32>
+// CHECK: %12 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %13 = vector.shape_cast %12 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %14 = vector.extract %cst[1] : f32 from vector<2xf32>
+// CHECK: %15 = vector.reduction <add>, %13 : vector<1xf32> into f32
+// CHECK: %c16_i32_10 = arith.constant 16 : i32
+// CHECK: %c1_i32_11 = arith.constant 1 : i32
+// CHECK: %shuffleResult_12, %valid_13 = gpu.shuffle  xor %15, %c1_i32_11, %c16_i32_10 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_12 : f32
+// CHECK: %c16_i32_14 = arith.constant 16 : i32
+// CHECK: %c2_i32_15 = arith.constant 2 : i32
+// CHECK: %shuffleResult_16, %valid_17 = gpu.shuffle  xor %16, %c2_i32_15, %c16_i32_14 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_16 : f32
+// CHECK: %c16_i32_18 = arith.constant 16 : i32
+// CHECK: %c4_i32_19 = arith.constant 4 : i32
+// CHECK: %shuffleResult_20, %valid_21 = gpu.shuffle  xor %17, %c4_i32_19, %c16_i32_18 : f32
+// CHECK: %18 = arith.addf %17, %shuffleResult_20 : f32
+// CHECK: %c16_i32_22 = arith.constant 16 : i32
+// CHECK: %c8_i32_23 = arith.constant 8 : i32
+// CHECK: %shuffleResult_24, %valid_25 = gpu.shuffle  xor %18, %c8_i32_23, %c16_i32_22 : f32
+// CHECK: %19 = arith.addf %18, %shuffleResult_24 : f32
+// CHECK: %20 = arith.addf %19, %14 : f32
+// CHECK: %21 = vector.insert %20, %11 [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
     %src = "some_def"()
       {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      : () -> (vector<2x2xf32>)
+      : () -> (vector<16x2xf32>)
     %acc = arith.constant
       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
       dense<0.0>  : vector<2xf32>
@@ -300,7 +293,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
       {
         layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
       }
-      [0] : vector<2x2xf32> to vector<2xf32>
+      [0] : vector<16x2xf32> to vector<2xf32>
   gpu.return
 }
 

>From 4853acf670b097f6be685036597a3a624307376b Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 19:02:56 +0000
Subject: [PATCH 10/14] polish

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 20 +----
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 51 +----------
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 90 ++++++++++---------
 3 files changed, 54 insertions(+), 107 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index d0dc2e8df927e..32c8089c7226a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -474,27 +474,17 @@ struct SgToWiMultiDimReduction
     assert(reductionDims.size() == 1 &&
            "Expecting single reduction dimension for subgroup multi "
            "reduction op");
-    // Only lane-local reduction is handled here.
     if (isReductionLaneLocal(op)) {
       auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
       VectorType resVecTy = dyn_cast<VectorType>(op.getType());
       auto resDistVecTyOrFailure =
           getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
-      // Simply create a new MultiDimReductionOp using adaptor operands and the
-      // new result type.
+      // For lane local reduction, simply create a new MultiDimReductionOp using
+      // adaptor operands and the new result type.
       result = vector::MultiDimReductionOp::create(
           rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
           adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
     } else {
-
-      // print adaptor.getSource() and adaptor.getAcc() for debugging
-      LLVM_DEBUG({
-        llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
-        llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
-      });
-      // before get distribute vec type for source, first set its shape to be
-      // unit
-      // for the reduction dimension
       auto reductionDim = reductionDims[0];
       VectorType sourceType = op.getSourceVectorType();
       SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
@@ -504,13 +494,7 @@ struct SgToWiMultiDimReduction
           cast<TypedValue<VectorType>>(adaptor.getSource()),
           cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
           reductionDim, reductionDimSize, op.getLoc(), rewriter);
-      // print the reduction op for debugging
-      LLVM_DEBUG({
-        llvm::dbgs() << "reductionOp3: " << *op << "\n";
-        llvm::dbgs() << "lowered reduction result3: " << result << "\n";
-      });
     }
-
     rewriter.replaceOp(op, result);
     return success();
   }
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a0986608bdba0..88e2c1a879fdd 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -121,36 +121,13 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
   // dimensions are not distributed.
   unsigned distributionStart =
       originalType.getRank() - effectiveLaneLayout.size();
-
-  // Print original shape and lane layout for debugging
-  std::string shapeStr = "[";
-  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i > 0)
-      shapeStr += ", ";
-    shapeStr += std::to_string(dim);
-  }
-  shapeStr += "]";
-  LDBG() << "original shape: " << shapeStr;
-
-  std::string layoutStr = "[";
-  for (auto [i, dim] : llvm::enumerate(effectiveLaneLayout)) {
-    if (i > 0)
-      layoutStr += ", ";
-    layoutStr += std::to_string(dim);
-  }
-  layoutStr += "]";
-  LDBG() << "effective lane layout: " << layoutStr;
-
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
     if (i < distributionStart)
       continue;
     // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0) {
-      assert( effectiveLaneLayout[i - distributionStart] % dim == 0 &&
-              "The dimension size must be able evenly distributed to all lanes in round-robin manner.");
-      distributedShape[i] = 1;
-    } else
-      distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
   }
   return VectorType::get(distributedShape, originalType.getElementType());
 }
@@ -765,22 +742,6 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
-  // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
-  // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
-  // // Reduction result should have the same layout as the accumulator.
-  // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
-
-  // print source shape, reduction dim and reduction size for debugging
-  std::string shapeStr = "[";
-  for (auto [i, dim] : llvm::enumerate(sourceType.getShape())) {
-    if (i > 0)
-      shapeStr += ", ";
-    shapeStr += std::to_string(dim);
-  }
-  shapeStr += "]";
-  LDBG() << "source shape: " << shapeStr;
-  LDBG() << "reduction dim: " << reductionDim;
-  LDBG() << "reduction size: " << reductionSize;
 
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
@@ -794,12 +755,6 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
       sliceSizes = {sourceH, 1};
     }
 
-    // print src, sliceOffsets, sliceSizes for debugging
-    LDBG() << "src: " << src;
-    LDBG() << "sliceOffsets: [" << sliceOffsets[0] << ", " << sliceOffsets[1]
-           << "]";
-    LDBG() << "sliceSizes: [" << sliceSizes[0] << ", " << sliceSizes[1] << "]";
-
     vector::ExtractStridedSliceOp extractOp =
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, {1, 1});
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 029cca419fa28..a87a67467d7fc 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -176,6 +176,10 @@ gpu.func @vector_reduction() {
 }
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK: %c0 = arith.constant 0 : index
+// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<2x1xf32>
+// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
 // CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
 // CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
 // CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
@@ -237,55 +241,59 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 }
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
-// CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
+// CHECK: %c0 = arith.constant 0 : index
+// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<1x2xf32>
+// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
 // CHECK: %c16_i32 = arith.constant 16 : i32
 // CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %5, %c1_i32, %c16_i32 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult : f32
-// CHECK: %c16_i32_1 = arith.constant 16 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c16_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c16_i32_2 = arith.constant 16 : i32
 // CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_2, %valid_3 = gpu.shuffle  xor %6, %c2_i32, %c16_i32_1 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_2 : f32
-// CHECK: %c16_i32_4 = arith.constant 16 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c16_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %c16_i32_5 = arith.constant 16 : i32
 // CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_5, %valid_6 = gpu.shuffle  xor %7, %c4_i32, %c16_i32_4 : f32
-// CHECK: %8 = arith.addf %7, %shuffleResult_5 : f32
-// CHECK: %c16_i32_7 = arith.constant 16 : i32
+// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle  xor %5, %c4_i32, %c16_i32_5 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
+// CHECK: %c16_i32_8 = arith.constant 16 : i32
 // CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_8, %valid_9 = gpu.shuffle  xor %8, %c8_i32, %c16_i32_7 : f32
-// CHECK: %9 = arith.addf %8, %shuffleResult_8 : f32
-// CHECK: %10 = arith.addf %9, %4 : f32
-// CHECK: %11 = vector.insert %10, %cst_0 [0] : f32 into vector<2xf32>
-// CHECK: %12 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %13 = vector.shape_cast %12 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %14 = vector.extract %cst[1] : f32 from vector<2xf32>
-// CHECK: %15 = vector.reduction <add>, %13 : vector<1xf32> into f32
-// CHECK: %c16_i32_10 = arith.constant 16 : i32
-// CHECK: %c1_i32_11 = arith.constant 1 : i32
-// CHECK: %shuffleResult_12, %valid_13 = gpu.shuffle  xor %15, %c1_i32_11, %c16_i32_10 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_12 : f32
-// CHECK: %c16_i32_14 = arith.constant 16 : i32
-// CHECK: %c2_i32_15 = arith.constant 2 : i32
-// CHECK: %shuffleResult_16, %valid_17 = gpu.shuffle  xor %16, %c2_i32_15, %c16_i32_14 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_16 : f32
-// CHECK: %c16_i32_18 = arith.constant 16 : i32
-// CHECK: %c4_i32_19 = arith.constant 4 : i32
-// CHECK: %shuffleResult_20, %valid_21 = gpu.shuffle  xor %17, %c4_i32_19, %c16_i32_18 : f32
-// CHECK: %18 = arith.addf %17, %shuffleResult_20 : f32
-// CHECK: %c16_i32_22 = arith.constant 16 : i32
-// CHECK: %c8_i32_23 = arith.constant 8 : i32
-// CHECK: %shuffleResult_24, %valid_25 = gpu.shuffle  xor %18, %c8_i32_23, %c16_i32_22 : f32
-// CHECK: %19 = arith.addf %18, %shuffleResult_24 : f32
-// CHECK: %20 = arith.addf %19, %14 : f32
-// CHECK: %21 = vector.insert %20, %11 [1] : f32 into vector<2xf32>
+// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle  xor %6, %c8_i32, %c16_i32_8 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
+// CHECK: %8 = arith.addf %7, %2 : f32
+// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
+// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
+// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
+// CHECK: %c16_i32_11 = arith.constant 16 : i32
+// CHECK: %c1_i32_12 = arith.constant 1 : i32
+// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle  xor %13, %c1_i32_12, %c16_i32_11 : f32
+// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
+// CHECK: %c16_i32_15 = arith.constant 16 : i32
+// CHECK: %c2_i32_16 = arith.constant 2 : i32
+// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle  xor %14, %c2_i32_16, %c16_i32_15 : f32
+// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
+// CHECK: %c16_i32_19 = arith.constant 16 : i32
+// CHECK: %c4_i32_20 = arith.constant 4 : i32
+// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle  xor %15, %c4_i32_20, %c16_i32_19 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
+// CHECK: %c16_i32_23 = arith.constant 16 : i32
+// CHECK: %c8_i32_24 = arith.constant 8 : i32
+// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle  xor %16, %c8_i32_24, %c16_i32_23 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
+// CHECK: %18 = arith.addf %17, %12 : f32
+// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
-    %src = "some_def"()
+    %src = arith.constant
       {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      : () -> (vector<16x2xf32>)
+      dense<0.0> : vector<16x2xf32>
     %acc = arith.constant
       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
       dense<0.0>  : vector<2xf32>

>From 0a2337075ebe5adc29fd43386a44c1aacffa0d85 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 19:24:19 +0000
Subject: [PATCH 11/14] polish legality check

---
 .../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp  | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 32c8089c7226a..7ecc1e3128d02 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -699,11 +699,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
   // vector::MultiDimReductionOp op legality.
   target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
       [=](vector::MultiDimReductionOp op) -> bool {
-        // Check common conditions for subgroup multi reduction op.
-        if (!isValidSubgroupMultiReductionOp(op))
-          return true;
-        // Lane local reductions are illegal at this point and must be lowered.
-        return false; // !isReductionLaneLocal(op);
+        return !isValidSubgroupMultiReductionOp(op);
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,

>From 5f35d52413950d4dec7290c77b4c906d04ff10b9 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:16:49 +0000
Subject: [PATCH 12/14] address feedback and fix test issue

---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |   8 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 192 +++++++++---------
 2 files changed, 103 insertions(+), 97 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 88e2c1a879fdd..63af57335595d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -736,13 +736,19 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
   VectorType sourceType = src.getType();
   int64_t sourceH = sourceType.getShape()[0];
   int64_t sourceW = sourceType.getShape()[1];
-  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+
   // Create a constant vector to hold the result of the reduction.
   TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
 
+  // nSlices is the number of reduction operations needed to reduce the entire
+  // source vector. For example, if reductionDim is 0, we are reducing across
+  // rows, and each slice is a column of the source vector. So the number of
+  // slices is the number of columns, which is sourceW.
+  int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+
   // For each slice of the source, extract the slice vector, do a reduction
   // and, insert the reduced value back to the result vector.
   for (int i = 0; i < nSlices; ++i) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a87a67467d7fc..450aa2cf6df05 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -176,54 +176,54 @@ gpu.func @vector_reduction() {
 }
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK: %c0 = arith.constant 0 : index
-// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<2x1xf32>
-// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c16_i32 = arith.constant 16 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c16_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c16_i32_2 = arith.constant 16 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c16_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %c16_i32_5 = arith.constant 16 : i32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle  xor %5, %c4_i32, %c16_i32_5 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
-// CHECK: %c16_i32_8 = arith.constant 16 : i32
-// CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle  xor %6, %c8_i32, %c16_i32_8 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
-// CHECK: %8 = arith.addf %7, %2 : f32
-// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
-// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
-// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
-// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
-// CHECK: %c16_i32_11 = arith.constant 16 : i32
-// CHECK: %c1_i32_12 = arith.constant 1 : i32
-// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle  xor %13, %c1_i32_12, %c16_i32_11 : f32
-// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
-// CHECK: %c16_i32_15 = arith.constant 16 : i32
-// CHECK: %c2_i32_16 = arith.constant 2 : i32
-// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle  xor %14, %c2_i32_16, %c16_i32_15 : f32
-// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
-// CHECK: %c16_i32_19 = arith.constant 16 : i32
-// CHECK: %c4_i32_20 = arith.constant 4 : i32
-// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle  xor %15, %c4_i32_20, %c16_i32_19 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
-// CHECK: %c16_i32_23 = arith.constant 16 : i32
-// CHECK: %c8_i32_24 = arith.constant 8 : i32
-// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle  xor %16, %c8_i32_24, %c16_i32_23 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
-// CHECK: %18 = arith.addf %17, %12 : f32
-// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<2x1xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[V0:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %[[V1:.*]] = vector.shape_cast %[[V0]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V2:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<2xf32>
+// CHECK: %[[V3:.*]] = vector.reduction <add>, %[[V1]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle  xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle  xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle  xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
+// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
+// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %[[V11:.*]] = vector.shape_cast %[[V10]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V12:.*]] = vector.extract %[[CST_0]][1] : f32 from vector<2xf32>
+// CHECK: %[[V13:.*]] = vector.reduction <add>, %[[V11]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle  xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle  xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle  xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle  xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
+// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
   %src = arith.constant
@@ -241,54 +241,54 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 }
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %c0 = arith.constant 0 : index
-// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<1x2xf32>
-// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c16_i32 = arith.constant 16 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle  xor %3, %c1_i32, %c16_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c16_i32_2 = arith.constant 16 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle  xor %4, %c2_i32, %c16_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %c16_i32_5 = arith.constant 16 : i32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle  xor %5, %c4_i32, %c16_i32_5 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
-// CHECK: %c16_i32_8 = arith.constant 16 : i32
-// CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle  xor %6, %c8_i32, %c16_i32_8 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
-// CHECK: %8 = arith.addf %7, %2 : f32
-// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
-// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
-// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
-// CHECK: %c16_i32_11 = arith.constant 16 : i32
-// CHECK: %c1_i32_12 = arith.constant 1 : i32
-// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle  xor %13, %c1_i32_12, %c16_i32_11 : f32
-// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
-// CHECK: %c16_i32_15 = arith.constant 16 : i32
-// CHECK: %c2_i32_16 = arith.constant 2 : i32
-// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle  xor %14, %c2_i32_16, %c16_i32_15 : f32
-// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
-// CHECK: %c16_i32_19 = arith.constant 16 : i32
-// CHECK: %c4_i32_20 = arith.constant 4 : i32
-// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle  xor %15, %c4_i32_20, %c16_i32_19 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
-// CHECK: %c16_i32_23 = arith.constant 16 : i32
-// CHECK: %c8_i32_24 = arith.constant 8 : i32
-// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle  xor %16, %c8_i32_24, %c16_i32_23 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
-// CHECK: %18 = arith.addf %17, %12 : f32
-// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[V0:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %[[V1:.*]] = vector.shape_cast %[[V0]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V2:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<2xf32>
+// CHECK: %[[V3:.*]] = vector.reduction <add>, %[[V1]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle  xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle  xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle  xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
+// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
+// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %[[V11:.*]] = vector.shape_cast %[[V10]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V12:.*]] = vector.extract %[[CST_0]][1] : f32 from vector<2xf32>
+// CHECK: %[[V13:.*]] = vector.reduction <add>, %[[V11]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle  xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle  xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle  xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle  xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
+// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
     %src = arith.constant

>From eb81bcb1bc6cc9da34ccc0a5acf625ac70af692e Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:30:51 +0000
Subject: [PATCH 13/14] address feedback

---
 mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h         | 4 ++++
 .../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 4 +---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp                | 7 -------
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index f2cbb198b2dc2..e7cae506d9f4e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,6 +147,10 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
                               vector::CombiningKind kind, int64_t reductionDim,
                               Location loc, PatternRewriter &rewriter);
 
+/// Lowers a cross-lane reduction of a 2D vector to shuffle operations.
+/// For each slice along the reduction dimension, reduces the slice across
+/// `reductionSize` work-items with shuffles, combines the reduced value with
+/// the matching element of `acc`, and inserts it into the result vector.
 Value lowerCrossLaneReductionToShuffles(TypedValue<VectorType> src,
                                         TypedValue<VectorType> acc,
                                         vector::CombiningKind kind,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 7ecc1e3128d02..32ead1867aa23 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -487,9 +487,7 @@ struct SgToWiMultiDimReduction
     } else {
       auto reductionDim = reductionDims[0];
       VectorType sourceType = op.getSourceVectorType();
-      SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
-                                          sourceType.getShape().end());
-      int64_t reductionDimSize = sourceShape[reductionDim];
+      int64_t reductionDimSize = sourceType.getShape()[reductionDim];
       result = xegpu::lowerCrossLaneReductionToShuffles(
           cast<TypedValue<VectorType>>(adaptor.getSource()),
           cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 63af57335595d..3271e73e0b571 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -764,24 +764,17 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
     vector::ExtractStridedSliceOp extractOp =
         vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                               sliceSizes, {1, 1});
-
     int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
-
     vector::ShapeCastOp slice = vector::ShapeCastOp::create(
         rewriter, loc,
         VectorType::get({nSliceElements}, sourceType.getElementType()),
         extractOp.getResult());
 
-    // Extract and reduction results in scalars, so no result layout is needed.
     Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
-
-    // Distribute and reduce across work-items in the subgroup.
     Value fullReduce =
         xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
-
     fullReduce =
         vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
-
     reductionResult =
         vector::InsertOp::create(rewriter, loc, fullReduce, reductionResult, i);
   }

>From e67872e34b6a1dec6210d55f02f25b5c5089b87c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:50:37 +0000
Subject: [PATCH 14/14] fix test

---
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 450aa2cf6df05..2225cfb4021cb 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -154,19 +154,19 @@ gpu.func @prefetch_nd() {
 // CHECK-DAG: %[[C16_1:.*]] = arith.constant 16 : i32
 // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
 // CHECK:     %[[SHUFFLE1:.*]], %{{.*}} = gpu.shuffle  xor %[[LANE_RED]], %[[C1]], %[[C16_1]] : f32
-// CHECK:     %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1]] : f32
+// CHECK:     %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1:.*]] : f32
 // CHECK-DAG: %[[C16_2:.*]] = arith.constant 16 : i32
 // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
 // CHECK:     %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD1]], %[[C2]], %[[C16_2]] : f32
-// CHECK:     %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2]] : f32
+// CHECK:     %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2:.*]] : f32
 // CHECK-DAG: %[[C16_3:.*]] = arith.constant 16 : i32
 // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
 // CHECK:     %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD2]], %[[C4]], %[[C16_3]] : f32
-// CHECK:     %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK:     %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3:.*]] : f32
 // CHECK-DAG: %[[C16_4:.*]] = arith.constant 16 : i32
 // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
 // CHECK:     %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle  xor %[[ADD3]], %[[C8]], %[[C16_4]] : f32
-// CHECK:     %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK:     %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4:.*]] : f32
 // CHECK:     %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
 gpu.func @vector_reduction() {
   %acc = arith.constant 1.0 : f32
@@ -187,19 +187,19 @@ gpu.func @vector_reduction() {
 // CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
 // CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
 // CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
-// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE:.*]] : f32
 // CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
 // CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
 // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle  xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
-// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2:.*]] : f32
 // CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
 // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
 // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle  xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
-// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3:.*]] : f32
 // CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
 // CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
 // CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle  xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
-// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4:.*]] : f32
 // CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
 // CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
 // CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
@@ -209,19 +209,19 @@ gpu.func @vector_reduction() {
 // CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
 // CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
 // CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle  xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
-// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5:.*]] : f32
 // CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
 // CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
 // CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle  xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
-// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6:.*]] : f32
 // CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
 // CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
 // CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle  xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
-// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7:.*]] : f32
 // CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
 // CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
 // CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle  xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
-// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8:.*]] : f32
 // CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
 // CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
@@ -252,19 +252,19 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 // CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
 // CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
 // CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle  xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
-// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE:.*]] : f32
 // CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
 // CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
 // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle  xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
-// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2:.*]] : f32
 // CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
 // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
 // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle  xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
-// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3:.*]] : f32
 // CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
 // CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
 // CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle  xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
-// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4:.*]] : f32
 // CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
 // CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
 // CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
@@ -274,19 +274,19 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 // CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
 // CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
 // CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle  xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
-// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5:.*]] : f32
 // CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
 // CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
 // CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle  xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
-// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6:.*]] : f32
 // CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
 // CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
 // CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle  xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
-// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7:.*]] : f32
 // CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
 // CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
 // CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle  xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
-// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8:.*]] : f32
 // CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
 // CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {