[Mlir-commits] [mlir] [MLIR][XeGPU] Add Layout Propagation support for multi-reduction/reduction op with scalar result (PR #189133)

Wed Apr 1 11:57:19 PDT 2026

https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/189133

>From 421b273d83b925070fe1d8912e1bf2c59f7b0ad9 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 26 Mar 2026 23:41:09 +0000
Subject: [PATCH 1/8] add propagation support for reduction to scalar

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |  14 ++-
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      |  45 +++++++-
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 103 +++++++++++++++++-
 3 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 55b18d4a19c55..9c6141a4b2a3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -82,6 +82,10 @@ DistributeLayoutAttr
 inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
                                 SmallVector<int64_t> reduceDims);
 
+/// Infers the source layout attribute for a reduction operation given the
+/// result layout attribute; the reduced dim is always dim 0.
+DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout);
+
 /// Infers the source layout attribute for a transpose operation given the
 /// result layout attribute and permutation.
 DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout,
@@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
                                     ArrayRef<int64_t> resShape,
                                     ArrayRef<int64_t> srcShape);
 
-/// Sets up layout for reduction operations by creating a SliceAttr for the
-/// result.
+/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for
+/// the result.
 ///
 /// This function first attempts to construct a source layout that, when
 /// sliced along reduction dimensions, produces a result layout compatible
@@ -122,6 +126,12 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
                                           SmallVector<int64_t> reductionDims,
                                           const uArch::uArch *uArch);
 
+/// Sets up layout for Reduction operations by creating a SliceAttr for the
+/// result.
+SliceAttr setupReductionResultLayout(LayoutKind layoutKind,
+                                     VectorType srcVectorTy,
+                                     const uArch::uArch *uArch);
+
 /// Setup the result layout attribute for a bitcast operation based on element
 /// type bitwidths. This ensures the source layout can always be derived from
 /// the result layout.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index ec5751634fdff..acfbd202cb91a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
   return sliceLayout.getParent();
 }
 
+xegpu::DistributeLayoutAttr
+xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) {
+  return xegpu::inferMultiReductionSourceLayout(resLayout, {0});
+}
+
 /// Infers the source layout attribute for a transpose operation given the
 /// result layout attribute and permutation.
 xegpu::DistributeLayoutAttr
@@ -546,6 +551,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
                                DenseI64ArrayAttr::get(context, reductionDims));
 }
 
+/// Sets up the result layout for a vector.reduction op: a SliceAttr wrapping
+/// a 1-D source layout derived from `srcVecTy` and the uArch subgroup size.
+/// Only LayoutKind::Lane is supported; Subgroup and InstData kinds assert.
+xegpu::SliceAttr
+xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
+                                  VectorType srcVecTy,
+                                  const xegpu::uArch::uArch *uArch) {
+  auto srcShape = srcVecTy.getShape();
+  auto context = srcVecTy.getContext();
+  auto subgroupSize = uArch->getSubgroupSize();
+  xegpu::LayoutAttr srcLayout;
+
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    assert(false &&
+           "subgroup layout assignment not supported for vector.reduction.");
+  } else if (layoutKind == xegpu::LayoutKind::InstData) {
+    assert(false &&
+           "instData layout assignment not supported for vector.reduction.");
+  } else if (layoutKind == xegpu::LayoutKind::Lane) {
+    // lane_layout = min(subgroup size, source length), lane_data = 1.
+    SmallVector<int32_t> laneLayout(1), laneData(1);
+    laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
+    laneData[0] = 1;
+    srcLayout = xegpu::LayoutAttr::get(
+        context, DenseI32ArrayAttr::get(context, laneLayout),
+        DenseI32ArrayAttr::get(context, laneData));
+  }
+
+  return xegpu::SliceAttr::get(context, srcLayout,
+                               DenseI64ArrayAttr::get(context, 0));
+}
+
 /// Sets up the result layout for a bitcast operation.
 /// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
 /// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
@@ -1079,7 +1116,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
   Operation *op = operand.getOwner();
   unsigned idx = operand.getOperandNumber();
   xegpu::DistributeLayoutAttr resLayout;
-  if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
+  if (op->getNumResults() == 1)
     resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
 
   // For vector::BroadcastOp, infer the source layout from the result layout.
@@ -1108,6 +1145,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
       return resLayout;
   }
 
+  if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    return xegpu::inferReductionSourceLayout(resLayout);
+  }
+
   // For vector::BitCastOp, infer source layout from result layout using
   // element type bitwidths.
   if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8675fe8b5cce1..5835fa7ebf868 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -365,6 +365,10 @@ class LayoutInfoPropagation
                                    ArrayRef<LayoutInfoLattice *> operands,
                                    ArrayRef<const LayoutInfoLattice *> results);
 
+  void visitVectorReductionOp(vector::ReductionOp reduction,
+                              ArrayRef<LayoutInfoLattice *> operands,
+                              ArrayRef<const LayoutInfoLattice *> results);
+
   void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
                               ArrayRef<LayoutInfoLattice *> operands,
                               ArrayRef<const LayoutInfoLattice *> results);
@@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
       .Case([&](vector::MultiDimReductionOp reductionOp) {
         visitVectorMultiReductionOp(reductionOp, operands, results);
       })
+      .Case([&](vector::ReductionOp reductionOp) {
+        visitVectorReductionOp(reductionOp, operands, results);
+      })
       .Case([&](vector::BroadcastOp broadcastOp) {
         visitVectorBroadCastOp(broadcastOp, operands, results);
       })
@@ -625,9 +632,10 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
     vector::MultiDimReductionOp reduction,
     ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {
+  Type resultTy = reduction.getDestType();
   // The layout of the result must be present.
   LayoutInfo resLayoutInfo = results[0]->getValue();
-  if (!resLayoutInfo.isAssigned())
+  if (llvm::isa<VectorType>(resultTy) && !resLayoutInfo.isAssigned())
     return;
 
   VectorType sourceTy = reduction.getSourceVectorType();
@@ -636,8 +644,30 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
   if (!uArch)
     return;
-  auto consumerLayoutAttr =
-      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  xegpu::DistributeLayoutAttr consumerLayoutAttr;
+  if (!llvm::isa<VectorType>(resultTy)) {
+    auto sgSize = uArch->getSubgroupSize();
+    auto numSgOrErr = getNumSg(reduction, sgSize);
+    if (failed(numSgOrErr)) {
+      reduction.emitWarning(
+          "Unable to determine the number of subgroups for the operation.");
+      return;
+    }
+    auto srcShape = sourceTy.getShape();
+    int srcRank = srcShape.size();
+    SmallVector<int32_t> sgLayout(srcRank, 1);
+    SmallVector<int32_t> sgData(srcRank, 1);
+    sgLayout.back() = numSgOrErr.value();
+    MLIRContext *context = reduction.getContext();
+    consumerLayoutAttr = xegpu::LayoutAttr::get(
+        context, DenseI32ArrayAttr::get(context, sgLayout),
+        DenseI32ArrayAttr::get(context, sgData),
+        /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+        /*lane_data =*/nullptr, /*order =*/nullptr);
+  } else {
+    consumerLayoutAttr =
+        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  }
 
   // The result layout represents the layout requirements of the operation.
   // it is recorded to anchor layout or temporary layout.
@@ -659,6 +689,42 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
                      operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
 }
 
+void LayoutInfoPropagation::visitVectorReductionOp(
+    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The required result layout is built from the source type alone; the
+  // `results` lattices are not consulted.
+  VectorType sourceTy = reduction.getSourceVectorType();
+
+  LLVM_DEBUG(DBGS() << "visitVectorReductionOp: " << reduction << "\n");
+  LLVM_DEBUG(DBGS() << "  sourceTy: " << sourceTy << "\n");
+
+  const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
+  if (!uArch)
+    return;
+
+  auto requiredResLayoutAttr =
+      xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
+
+  LLVM_DEBUG(DBGS() << "  requiredResLayoutAttr: " << requiredResLayoutAttr
+                    << "\n");
+
+  xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
+
+  // Derive the source layout from the required result layout.
+  auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
+
+  LLVM_DEBUG(DBGS() << "  srcLayoutAttr: " << srcLayoutAttr << "\n");
+
+  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
+
+  if (reduction.getAcc()) {
+    // Accumulator should have the same layout as the result.
+    propagateIfChanged(operands[1],
+                       operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
+  }
+}
+
 void LayoutInfoPropagation::visitVectorBroadCastOp(
     vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {
@@ -1286,6 +1352,7 @@ struct ResolveLayoutConflicts {
   OpBuilder builder;
   LogicalResult resolveTensorDescConsumer(OpOperand &operand);
   LogicalResult resolveVectorConsumer(OpOperand &operand);
+  LogicalResult assignScalarResultLayout(OpResult &result);
 };
 
 } // namespace
@@ -1294,6 +1361,21 @@ LogicalResult ResolveLayoutConflicts::run() {
   // Scan all operations in the parent op and resolve layout conflicts at
   // tensor descriptor and vector use points.
   auto r = parentOp->walk([&](Operation *op) -> WalkResult {
+    // If the op takes a vector and produces a scalar (e.g. multi-reduction),
+    // add a convert_layout carrying the result's layout on the scalar result
+    // to serve as the anchor op for the reduction op's layout.
+    if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
+      for (OpResult result : op->getResults()) {
+        if (result.getType().isIntOrFloat()) {
+          auto res = assignScalarResultLayout(result);
+          if (failed(res)) {
+            DBGS() << "Failed to resolve vector consumer for multi-reduction "
+                   << *op << "\n";
+            return WalkResult::interrupt();
+          }
+        }
+      }
+    }
     for (OpOperand &operand : op->getOpOperands()) {
       // Handle conflicts in tensor descriptor operands.
       Type operandType = operand.get().getType();
@@ -1321,6 +1403,21 @@ LogicalResult ResolveLayoutConflicts::run() {
   return r.wasInterrupted() ? failure() : success();
 }
 
+LogicalResult
+ResolveLayoutConflicts::assignScalarResultLayout(OpResult &result) {
+  Operation *ProducerOp = result.getDefiningOp();
+  // Get the layout currently attached to the result.
+  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
+  // Insert a convert_layout op with identical input and target layouts.
+  builder.setInsertionPointAfterValue(result);
+  auto convertOp = xegpu::ConvertLayoutOp::create(
+      builder, ProducerOp->getLoc(), result.getType(), result, producerLayout,
+      producerLayout);
+  // Redirect every other user of the result to the converted value.
+  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
+  return success();
+}
+
 LogicalResult
 ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
   Value vectorValue = operand.get();

>From ba6970f273fc42567c373bf5afa57ab9eea69608 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 17:37:46 +0000
Subject: [PATCH 2/8] add tests

---
 .../XeGPU/propagate-layout-subgroup.mlir      | 14 ++++++++++
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index e4e6d61b92fda..6595feb1ba696 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -140,6 +140,20 @@ gpu.module @test {
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_row_reduction_scalar
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
+  gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
+    %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
+    %reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
+    gpu.return
+  }
+}
+
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: vector_nest_reduction
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 221e963ed9ac1..98d78997b95b3 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -746,6 +746,32 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} 0.000000e+00 : f16
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} true : i1
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : f16, memref<16xf16>, index, i1
+func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<1xi1>
+    %0 = vector.step : vector<1xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+    %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
+    %cst_0 = arith.constant 0.000000e+00 : f16
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
+    %cst_2 = arith.constant true : i1
+    %cst_3 = arith.constant 1 : index
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(

>From a71c24c623795dc142f9c69645aaa10443231b28 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 19:32:25 +0000
Subject: [PATCH 3/8] cleanup and add tests

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 12 +++----
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  9 ++---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |  6 ++--
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 34 +++++++++---------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 36 ++++++++-----------
 .../XeGPU/propagate-layout-subgroup.mlir      | 11 +++---
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 10 +++---
 7 files changed, 52 insertions(+), 66 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 3526178ea5753..d87d254158c27 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
       OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
       OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
       OptionalAttr<DistributeLayoutAttr>:$layout);
-  let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
+  let results = (outs XeGPU_ValueOrScalarType:$value);
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
 
@@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
 
   }];
 
-  let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value,
+  let arguments = (ins XeGPU_ValueOrScalarType:$value,
       XeGPU_GatherScatterSourceType:$dest,
       Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
       AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
@@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
             : vector<128x128xf16>
         ```
     }];
-    let arguments = (ins XeGPU_ConvertLayoutType: $source,
+    let arguments = (ins XeGPU_ValueOrScalarType: $source,
                          DistributeLayoutAttr: $input_layout,
                          DistributeLayoutAttr: $target_layout);
-    let results = (outs XeGPU_ConvertLayoutType: $result);
+    let results = (outs XeGPU_ValueOrScalarType: $result);
     let assemblyFormat = [{
         $source prop-dict attr-dict `:` type($source)
     }];
@@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     OptionalAttr<UnitAttr>:$subgroup_block_io,
     OptionalAttr<DistributeLayoutAttr>:$layout
   );
-  let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
+  let results = (outs XeGPU_ValueOrScalarType:$res);
   let assemblyFormat = [{
     $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `` `:` type(operands) `->` type(results)
@@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
 def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                               AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
   let arguments = (ins
-    AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
+    XeGPU_ValueOrScalarType:$data,
     XeGPU_MemDesc:$mem_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f41c0bf1fd2b2..f8b0445e3e2a5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -25,12 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
 def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
 def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
-def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_VectorOrOffsetVectorType
-    : VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
-def XeGPU_ConvertLayoutType
-    : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
+def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+def XeGPU_ValueOrScalarType
+    : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
 def XeGPU_GatherScatterBaseAddrType
     : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 9c6141a4b2a3b..9cf9a8705209b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -124,7 +124,7 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
                                           VectorType srcVectorTy,
                                           DistributeLayoutAttr consumerLayout,
                                           SmallVector<int64_t> reductionDims,
-                                          const uArch::uArch *uArch);
+                                          int numSg, const uArch::uArch *uArch);
 
 /// Sets up layout for Reduction operations by creating a SliceAttr for the
 /// result.
@@ -180,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
 std::optional<std::tuple<DistributeLayoutAttr, DistributeLayoutAttr,
                          DistributeLayoutAttr>>
 setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
-                VectorType cdTy, DistributeLayoutAttr consumerLayout,
-                const uArch::uArch *uArch, int numSg);
+                VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
+                const uArch::uArch *uArch);
 
 /// Gets the expected layout for a given consumer operand. This will check if
 /// the owning operation of the consumer operand is one of the special layout
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index acfbd202cb91a..cd5ac6567e0bc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -404,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 /// Examples:
 ///   1. Subgroup layout - Row reduction on 2D tensor:
 ///      srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
-///      workgroupSize=32
+///      NumSg=32
 ///      * Consumer Layout:
 ///        #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
 ///        [1]>}
@@ -445,11 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     xegpu::LayoutKind layoutKind, VectorType srcVecTy,
     DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
-    const xegpu::uArch::uArch *uArch) {
+    int numSg, const xegpu::uArch::uArch *uArch) {
 
   auto srcShape = srcVecTy.getShape();
   int srcRank = srcShape.size();
-  auto context = consumerLayout.getContext();
+  auto context = srcVecTy.getContext();
 
   // Reduction layout requires at least 2D tensors
   if (srcRank < 2)
@@ -461,21 +461,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     return DenseI32ArrayAttr::get(context, vec32);
   };
 
-  const int workgroupSize = consumerLayout.getNumSubgroups();
   const int subgroupSize = uArch->getSubgroupSize();
   int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
-
-  SmallVector<int64_t> consumerSgLayout =
-      consumerLayout.getEffectiveSgLayoutAsInt();
-  SmallVector<int64_t> consumerLaneLayout =
-      consumerLayout.getEffectiveLaneLayoutAsInt();
-  SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
-  DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
-
   xegpu::DistributeLayoutAttr srcLayout;
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
     xegpu::SliceAttr consumerSliceLayout =
-        dyn_cast<xegpu::SliceAttr>(consumerLayout);
+        consumerLayout ? dyn_cast<xegpu::SliceAttr>(consumerLayout) : nullptr;
     if (consumerSliceLayout &&
         consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
       srcLayout = consumerSliceLayout.getParent();
@@ -487,9 +478,16 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
           srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
         }
     } else {
-
+      SmallVector<int64_t> consumerSgLayout =
+          consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
+                         : SmallVector<int64_t>();
+      SmallVector<int64_t> consumerOrder =
+          consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
+                         : SmallVector<int64_t>();
+      DenseI32ArrayAttr orderAttr =
+          consumerLayout ? consumerLayout.getOrder() : nullptr;
       SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
-      int remainingSgCount = workgroupSize;
+      int remainingSgCount = numSg;
       int consumerIdx = 0;
 
       // First pass: Match consumer's layout on non-reduction dimensions
@@ -507,6 +505,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
       }
 
       // Second pass: Distribute remaining subgroups across reduction dimensions
+      // the reduction to scalar case is handled only by this loop
       int64_t remainOrder = consumerSgLayout.size();
       for (int i = 0; i < srcRank; i++) {
         if (llvm::is_contained(reductionDims, i)) {
@@ -535,7 +534,6 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     instData[srcRank - 1] =
         std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
-
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
 
     SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
@@ -967,8 +965,8 @@ std::optional<
                xegpu::DistributeLayoutAttr>>
 xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
                        VectorType bTy, VectorType cdTy,
-                       xegpu::DistributeLayoutAttr consumerLayout,
-                       const xegpu::uArch::uArch *uArch, int numSg) {
+                       xegpu::DistributeLayoutAttr consumerLayout, int numSg,
+                       const xegpu::uArch::uArch *uArch) {
   auto context = aTy.getContext();
   const auto *uArchInstruction =
       dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 5835fa7ebf868..cbc590e71f1e5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -635,8 +635,14 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   Type resultTy = reduction.getDestType();
   // The layout of the result must be present.
   LayoutInfo resLayoutInfo = results[0]->getValue();
-  if (llvm::isa<VectorType>(resultTy) && !resLayoutInfo.isAssigned())
-    return;
+
+  xegpu::DistributeLayoutAttr consumerLayoutAttr;
+  if (!resultTy.isIntOrFloat())
+    if (!resLayoutInfo.isAssigned())
+      return;
+    else
+      consumerLayoutAttr =
+          dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
 
   VectorType sourceTy = reduction.getSourceVectorType();
   SmallVector<int64_t> reductionDims(reduction.getReductionDims());
@@ -644,29 +650,15 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
   if (!uArch)
     return;
-  xegpu::DistributeLayoutAttr consumerLayoutAttr;
-  if (!llvm::isa<VectorType>(resultTy)) {
-    auto sgSize = uArch->getSubgroupSize();
-    auto numSgOrErr = getNumSg(reduction, sgSize);
+  int numSg = 0;
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
     if (failed(numSgOrErr)) {
       reduction.emitWarning(
           "Unable to determine the number of subgroups for the operation.");
       return;
     }
-    auto srcShape = sourceTy.getShape();
-    int srcRank = srcShape.size();
-    SmallVector<int32_t> sgLayout(srcRank, 1);
-    SmallVector<int32_t> sgData(srcRank, 1);
-    sgLayout.back() = numSgOrErr.value();
-    MLIRContext *context = reduction.getContext();
-    consumerLayoutAttr = xegpu::LayoutAttr::get(
-        context, DenseI32ArrayAttr::get(context, sgLayout),
-        DenseI32ArrayAttr::get(context, sgData),
-        /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
-        /*lane_data =*/nullptr, /*order =*/nullptr);
-  } else {
-    consumerLayoutAttr =
-        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+    numSg = numSgOrErr.value();
   }
 
   // The result layout represents the layout requirements of the operation.
@@ -675,7 +667,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   // propagated from consumer op, the conflict is resolved in later phase by
   // converting the required result layout to the consumer layout
   auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
-      layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
+      layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);
 
   xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
 
@@ -831,7 +823,7 @@ void LayoutInfoPropagation::visitDpasOp(
       numSg = numSgOrErr.value();
     }
     auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
-                                          consumerLayoutAttr, uArch, numSg);
+                                          consumerLayoutAttr, numSg, uArch);
     if (!layouts.has_value()) {
       dpas.emitWarning(
           "Failed to determine required layouts for DPAS operands.");
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index 6595feb1ba696..bb387b4cfb093 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -128,7 +128,7 @@ gpu.module @test {
 gpu.module @test {
 // CHECK-LABEL: vector_row_reduction
 // CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
-  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@@ -144,8 +144,8 @@ gpu.module @test {
 gpu.module @test {
 // CHECK-LABEL: vector_row_reduction_scalar
 // CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
-  gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
-    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+  gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
+    %cst = arith.constant 0.000000e+00 : f32
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
     %reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
@@ -153,11 +153,10 @@ gpu.module @test {
   }
 }
 
-
 // -----
 gpu.module @test {
 // CHECK-LABEL: vector_nest_reduction
-  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@@ -195,7 +194,7 @@ gpu.module @test {
 // CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
 // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
 // CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
-  gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 98d78997b95b3..d049a1e57acf2 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -753,11 +753,11 @@ gpu.module @test {
 // CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
 // CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
 // CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} 0.000000e+00 : f16
+// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16
 // CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} true : i1
-// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} 1 : index
-// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : f16, memref<16xf16>, index, i1
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
 func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
     %cst = arith.constant dense<true> : vector<1xi1>
     %0 = vector.step : vector<1xindex>
@@ -765,7 +765,7 @@ func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16x
     %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
     %cst_0 = arith.constant 0.000000e+00 : f16
     %4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
-    %cst_2 = arith.constant true : i1
+    %cst_2 = arith.constant true
     %cst_3 = arith.constant 1 : index
     xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
     return

>From 9825526aa9368422a1357861d19df82bdfb37764 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 23:28:29 +0000
Subject: [PATCH 4/8] fix bug

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td            | 4 ++--
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td          | 5 +++--
 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp     | 3 ++-
 .../lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 8 ++------
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d87d254158c27..e001419257d8f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
             : vector<128x128xf16>
         ```
     }];
-    let arguments = (ins XeGPU_ValueOrScalarType: $source,
+    let arguments = (ins XeGPU_VectorOrScalarType: $source,
                          DistributeLayoutAttr: $input_layout,
                          DistributeLayoutAttr: $target_layout);
-    let results = (outs XeGPU_ValueOrScalarType: $result);
+    let results = (outs XeGPU_VectorOrScalarType: $result);
     let assemblyFormat = [{
         $source prop-dict attr-dict `:` type($source)
     }];
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f8b0445e3e2a5..7e142b20c0894 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -26,8 +26,9 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
 def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
 def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_ValueOrScalarType
-    : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
+def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
+def XeGPU_VectorOrScalarType
+    : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
 def XeGPU_GatherScatterBaseAddrType
     : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index cd5ac6567e0bc..67d48ee6a105e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -487,7 +487,8 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
       DenseI32ArrayAttr orderAttr =
           consumerLayout ? consumerLayout.getOrder() : nullptr;
       SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
-      int remainingSgCount = numSg;
+      int remainingSgCount =
+          consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
       int consumerIdx = 0;
 
       // First pass: Match consumer's layout on non-reduction dimensions
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index cbc590e71f1e5..fe22a46bcebf4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -653,12 +653,8 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   int numSg = 0;
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
     auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
-    if (failed(numSgOrErr)) {
-      reduction.emitWarning(
-          "Unable to determine the number of subgroups for the operation.");
-      return;
-    }
-    numSg = numSgOrErr.value();
+    if (succeeded(numSgOrErr))
+      numSg = numSgOrErr.value();
   }
 
   // The result layout represents the layout requirements of the operation.

>From d7b4933fc0cbfda35c3de987350ff808a33065ba Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 23:38:20 +0000
Subject: [PATCH 5/8] add test

---
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index d049a1e57acf2..26936dab2fb38 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -772,6 +772,28 @@ func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16x
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>} : vector<16xf16> into f16
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
+func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<16xi1>
+    %0 = vector.step : vector<16xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+    %4 = vector.reduction <add>, %1: vector<16xf16> into f16
+    %cst_2 = arith.constant true
+    %cst_3 = arith.constant 1 : index
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(

>From 82dfeffd7eb2cf0c6218fba0a29b9d0d10a0a1e7 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 30 Mar 2026 22:29:04 +0000
Subject: [PATCH 6/8] fixing issues

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 29 +++++++++----------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  8 ++---
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 67d48ee6a105e..b7dbfc0defc92 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -451,10 +451,6 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
   int srcRank = srcShape.size();
   auto context = srcVecTy.getContext();
 
-  // Reduction layout requires at least 2D tensors
-  if (srcRank < 2)
-    return nullptr;
-
   // Helper lambda to convert int64 vectors to int32 DenseArrayAttr
   auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
     SmallVector<int32_t> vec32(vec.begin(), vec.end());
@@ -532,16 +528,18 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
     SmallVector<int64_t> instData(srcRank, 1);
     instData[srcRank - 2] =
         std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
-    instData[srcRank - 1] =
-        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
+    if (srcRank >= 2)
+      instData[srcRank - 1] =
+          std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
 
     SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
     laneLayout[srcRank - 1] =
         std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
-    laneData[srcRank - 2] =
-        std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+    if (srcRank >= 2)
+      laneData[srcRank - 2] =
+          std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
     srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
                                        toInt32Attr(laneData));
   }
@@ -688,13 +686,14 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
     }
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
     for (int dim = 0; dim < srcRank; dim++) {
-      assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
-             "srcShape must be divisible by laneLayout for all dimensions");
-      laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
-                               consumerLaneData[dim]);
-
-      requiredResLayout =
-          requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
+      if (consumerLaneData[dim] != srcShape[dim]) {
+        assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
+               "srcShape must be divisible by laneLayout for all dimensions");
+        laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
+                                 consumerLaneData[dim]);
+        requiredResLayout =
+            requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
+      }
     }
   }
   return requiredResLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index fe22a46bcebf4..a35f94e7be561 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -637,12 +637,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
   LayoutInfo resLayoutInfo = results[0]->getValue();
 
   xegpu::DistributeLayoutAttr consumerLayoutAttr;
-  if (!resultTy.isIntOrFloat())
+  if (!resultTy.isIntOrFloat()) {
     if (!resLayoutInfo.isAssigned())
       return;
-    else
-      consumerLayoutAttr =
-          dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+    consumerLayoutAttr =
+        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  }
 
   VectorType sourceTy = reduction.getSourceVectorType();
   SmallVector<int64_t> reductionDims(reduction.getReductionDims());

>From b57ac5b1812078d98eb04bed9c9116fb40633adf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 1 Apr 2026 18:22:48 +0000
Subject: [PATCH 7/8] address feedback

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 32 +++++++++----------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 15 ++++-----
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b7dbfc0defc92..55cd6ec04970c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -462,7 +462,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
   xegpu::DistributeLayoutAttr srcLayout;
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
     xegpu::SliceAttr consumerSliceLayout =
-        consumerLayout ? dyn_cast<xegpu::SliceAttr>(consumerLayout) : nullptr;
+        dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
     if (consumerSliceLayout &&
         consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
       srcLayout = consumerSliceLayout.getParent();
@@ -526,11 +526,11 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
   } else if (layoutKind == xegpu::LayoutKind::InstData) {
 
     SmallVector<int64_t> instData(srcRank, 1);
-    instData[srcRank - 2] =
-        std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
     if (srcRank >= 2)
-      instData[srcRank - 1] =
-          std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
+      instData[srcRank - 2] =
+          std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+    instData[srcRank - 1] =
+        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
     srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
 
@@ -561,11 +561,11 @@ xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
   xegpu::LayoutAttr srcLayout;
 
   if (layoutKind == xegpu::LayoutKind::Subgroup) {
-    assert(true &&
-           "subgroup layout assignment not supported for insertStridedSlice.");
+    assert(true && "subgroup layout assignment not supported for reduction (op "
+                   "is not expected at this level).");
   } else if (layoutKind == xegpu::LayoutKind::InstData) {
-    assert(true &&
-           "instData layout assignment not supported for insertStridedSlice.");
+    assert(true && "instData layout assignment not supported for reduction (op "
+                   "is not expected at this level).");
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
     SmallVector<int32_t> laneLayout(1), laneData(1);
     laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
@@ -686,14 +686,12 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
     }
   } else if (layoutKind == xegpu::LayoutKind::Lane) {
     for (int dim = 0; dim < srcRank; dim++) {
-      if (consumerLaneData[dim] != srcShape[dim]) {
-        assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
-               "srcShape must be divisible by laneLayout for all dimensions");
-        laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
-                                 consumerLaneData[dim]);
-        requiredResLayout =
-            requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
-      }
+      assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
+             "srcShape must be divisible by laneLayout for all dimensions");
+      laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
+                               consumerLaneData[dim]);
+      requiredResLayout =
+          requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
     }
   }
   return requiredResLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index a35f94e7be561..95a4a33db48df 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1340,7 +1340,7 @@ struct ResolveLayoutConflicts {
   OpBuilder builder;
   LogicalResult resolveTensorDescConsumer(OpOperand &operand);
   LogicalResult resolveVectorConsumer(OpOperand &operand);
-  LogicalResult assignScalarResultLayout(OpResult &result);
+  LogicalResult assignResultLayout(OpResult &result);
 };
 
 } // namespace
@@ -1355,7 +1355,7 @@ LogicalResult ResolveLayoutConflicts::run() {
     if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
       for (OpResult result : op->getResults()) {
         if (result.getType().isIntOrFloat()) {
-          auto res = assignScalarResultLayout(result);
+          auto res = assignResultLayout(result);
           if (failed(res)) {
             DBGS() << "Failed to resolve vector consumer for multi-reduction "
                    << *op << "\n";
@@ -1391,17 +1391,14 @@ LogicalResult ResolveLayoutConflicts::run() {
   return r.wasInterrupted() ? failure() : success();
 }
 
-LogicalResult
-ResolveLayoutConflicts::assignScalarResultLayout(OpResult &result) {
-  Operation *ProducerOp = result.getDefiningOp();
-  // Get the current layout of the vector value.
+LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
+  Operation *producerOp = result.getDefiningOp();
   auto producerLayout = xegpu::getDistributeLayoutAttr(result);
-  // Insert a convert_layout op to resolve the conflict.
+  // Insert a convert_layout op to assign the layout.
   builder.setInsertionPointAfterValue(result);
   auto convertOp = xegpu::ConvertLayoutOp::create(
-      builder, ProducerOp->getLoc(), result.getType(), result, producerLayout,
+      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
       producerLayout);
-  // Update the users to use the converted value.
   result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
   return success();
 }

>From 391a0f40796ddc4daca181eeb99d97fb34689626 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 1 Apr 2026 18:57:01 +0000
Subject: [PATCH 8/8] remove debug print

---
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp  | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 95a4a33db48df..4c30dacae8850 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -680,37 +680,21 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
 void LayoutInfoPropagation::visitVectorReductionOp(
     vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {
-  // The layout of the result must be present.
 
   VectorType sourceTy = reduction.getSourceVectorType();
-
-  LLVM_DEBUG(DBGS() << "visitVectorReductionOp: " << reduction << "\n");
-  LLVM_DEBUG(DBGS() << "  sourceTy: " << sourceTy << "\n");
-
   const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
   if (!uArch)
     return;
 
   auto requiredResLayoutAttr =
       xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
-
-  LLVM_DEBUG(DBGS() << "  requiredResLayoutAttr: " << requiredResLayoutAttr
-                    << "\n");
-
   xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
 
-  // derive the source layout from the dominant layout and reduction dims
   auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
-
-  LLVM_DEBUG(DBGS() << "  srcLayoutAttr: " << srcLayoutAttr << "\n");
-
   propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
-
-  if (reduction.getAcc()) {
-    // Accumulator should have the same layout as the result.
+  if (reduction.getAcc())
     propagateIfChanged(operands[1],
                        operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
-  }
 }
 
 void LayoutInfoPropagation::visitVectorBroadCastOp(