[Mlir-commits] [mlir] [MLIR][XeGPU] Add Layout Propagation support for multi-reduction/reduction op with scalar result (PR #189133)
Jianhui Li
llvmlistbot at llvm.org
Wed Apr 1 11:57:19 PDT 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/189133
>From 421b273d83b925070fe1d8912e1bf2c59f7b0ad9 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 26 Mar 2026 23:41:09 +0000
Subject: [PATCH 1/8] add propagation support for reduction to scalar
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 14 ++-
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 45 +++++++-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 103 +++++++++++++++++-
3 files changed, 156 insertions(+), 6 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 55b18d4a19c55..9c6141a4b2a3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -82,6 +82,10 @@ DistributeLayoutAttr
inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
SmallVector<int64_t> reduceDims);
+/// Infers the source layout attribute for a vector.reduction operation given
+/// the result layout attribute. No reduction dims are taken: the definition
+/// always reduces dim 0 of the source.
+DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout);
+
/// Infers the source layout attribute for a transpose operation given the
/// result layout attribute and permutation.
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout,
@@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
ArrayRef<int64_t> srcShape);
-/// Sets up layout for reduction operations by creating a SliceAttr for the
-/// result.
+/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for
+/// the result.
///
/// This function first attempts to construct a source layout that, when
/// sliced along reduction dimensions, produces a result layout compatible
@@ -122,6 +126,12 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
SmallVector<int64_t> reductionDims,
const uArch::uArch *uArch);
+/// Sets up layout for Reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// The definition only builds a source layout for LayoutKind::Lane: a 1-D
+/// layout with lane_layout = min(subgroup size, dim 0 of `srcVectorTy`) and
+/// lane_data = 1, which becomes the parent of the returned SliceAttr.
+SliceAttr setupReductionResultLayout(LayoutKind layoutKind,
+ VectorType srcVectorTy,
+ const uArch::uArch *uArch);
+
/// Setup the result layout attribute for a bitcast operation based on element
/// type bitwidths. This ensures the source layout can always be derived from
/// the result layout.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index ec5751634fdff..acfbd202cb91a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
return sliceLayout.getParent();
}
+/// Infers the source layout of a vector.reduction from its result layout by
+/// treating the op as a multi-reduction over dim 0 of the source.
+xegpu::DistributeLayoutAttr
+xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) {
+  SmallVector<int64_t> reducedDims = {0};
+  return xegpu::inferMultiReductionSourceLayout(resLayout, reducedDims);
+}
+
/// Infers the source layout attribute for a transpose operation given the
/// result layout attribute and permutation.
xegpu::DistributeLayoutAttr
@@ -546,6 +551,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
DenseI64ArrayAttr::get(context, reductionDims));
}
+/// Sets up layout for Reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// Only LayoutKind::Lane is supported: the source gets a 1-D layout with
+/// lane_layout = [min(subgroupSize, srcShape[0])] and lane_data = [1], and the
+/// returned attribute is that layout sliced along dim 0. Subgroup and InstData
+/// layout assignment is not implemented for vector.reduction; those kinds trip
+/// an assertion in assert-enabled builds.
+xegpu::SliceAttr
+xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
+                                  VectorType srcVecTy,
+                                  const xegpu::uArch::uArch *uArch) {
+  auto srcShape = srcVecTy.getShape();
+  auto context = srcVecTy.getContext();
+  auto subgroupSize = uArch->getSubgroupSize();
+  xegpu::LayoutAttr srcLayout;
+
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    // Unsupported layout kind: the condition must be false for the assertion
+    // to fire.
+    assert(false &&
+           "subgroup layout assignment not supported for vector.reduction.");
+  } else if (layoutKind == xegpu::LayoutKind::InstData) {
+    assert(false &&
+           "instData layout assignment not supported for vector.reduction.");
+  } else if (layoutKind == xegpu::LayoutKind::Lane) {
+    SmallVector<int32_t> laneLayout(1), laneData(1);
+    // Spread the single source dim over at most one subgroup's worth of lanes.
+    laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
+    laneData[0] = 1;
+    srcLayout = xegpu::LayoutAttr::get(
+        context, DenseI32ArrayAttr::get(context, laneLayout),
+        DenseI32ArrayAttr::get(context, laneData));
+  }
+
+  // The reduced (sliced) dimension is dim 0 of the source layout.
+  SmallVector<int64_t> reducedDims = {0};
+  auto result = xegpu::SliceAttr::get(
+      context, srcLayout, DenseI64ArrayAttr::get(context, reducedDims));
+  return result;
+}
+
/// Sets up the result layout for a bitcast operation.
/// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
/// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
@@ -1079,7 +1116,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
Operation *op = operand.getOwner();
unsigned idx = operand.getOperandNumber();
xegpu::DistributeLayoutAttr resLayout;
- if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
+ if (op->getNumResults() == 1)
resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
// For vector::BroadcastOp, infer the source layout from the result layout.
@@ -1108,6 +1145,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
return resLayout;
}
+ if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
+ return xegpu::inferReductionSourceLayout(resLayout);
+ }
+
// For vector::BitCastOp, infer source layout from result layout using
// element type bitwidths.
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8675fe8b5cce1..5835fa7ebf868 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -365,6 +365,10 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
+ void visitVectorReductionOp(vector::ReductionOp reduction,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
+
void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](vector::MultiDimReductionOp reductionOp) {
visitVectorMultiReductionOp(reductionOp, operands, results);
})
+ .Case([&](vector::ReductionOp reductionOp) {
+ visitVectorReductionOp(reductionOp, operands, results);
+ })
.Case([&](vector::BroadcastOp broadcastOp) {
visitVectorBroadCastOp(broadcastOp, operands, results);
})
@@ -625,9 +632,10 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
vector::MultiDimReductionOp reduction,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
+ Type resultTy = reduction.getDestType();
// The layout of the result must be present.
LayoutInfo resLayoutInfo = results[0]->getValue();
- if (!resLayoutInfo.isAssigned())
+ if (llvm::isa<VectorType>(resultTy) && !resLayoutInfo.isAssigned())
return;
VectorType sourceTy = reduction.getSourceVectorType();
@@ -636,8 +644,30 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
if (!uArch)
return;
- auto consumerLayoutAttr =
- dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ xegpu::DistributeLayoutAttr consumerLayoutAttr;
+ if (!llvm::isa<VectorType>(resultTy)) {
+ auto sgSize = uArch->getSubgroupSize();
+ auto numSgOrErr = getNumSg(reduction, sgSize);
+ if (failed(numSgOrErr)) {
+ reduction.emitWarning(
+ "Unable to determine the number of subgroups for the operation.");
+ return;
+ }
+ auto srcShape = sourceTy.getShape();
+ int srcRank = srcShape.size();
+ SmallVector<int32_t> sgLayout(srcRank, 1);
+ SmallVector<int32_t> sgData(srcRank, 1);
+ sgLayout.back() = numSgOrErr.value();
+ MLIRContext *context = reduction.getContext();
+ consumerLayoutAttr = xegpu::LayoutAttr::get(
+ context, DenseI32ArrayAttr::get(context, sgLayout),
+ DenseI32ArrayAttr::get(context, sgData),
+ /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+ /*lane_data =*/nullptr, /*order =*/nullptr);
+ } else {
+ consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ }
// The result layout represents the layout requirements of the operation.
// it is recorded to anchor layout or temporary layout.
@@ -659,6 +689,42 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}
+/// Propagates layouts through vector.reduction.
+///
+/// The result lattice is not consulted: the required result layout is computed
+/// from the source vector type, the current layout kind and the target uArch.
+/// It is recorded as a temporary layout on the result; the source layout
+/// inferred from it is pushed to operand 0, and the result layout itself is
+/// pushed to the accumulator operand when one is present.
+void LayoutInfoPropagation::visitVectorReductionOp(
+    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  VectorType sourceTy = reduction.getSourceVectorType();
+
+  LLVM_DEBUG(DBGS() << "visitVectorReductionOp: " << reduction << "\n");
+  LLVM_DEBUG(DBGS() << " sourceTy: " << sourceTy << "\n");
+
+  const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
+  if (!uArch)
+    return;
+
+  auto requiredResLayoutAttr =
+      xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
+
+  LLVM_DEBUG(DBGS() << " requiredResLayoutAttr: " << requiredResLayoutAttr
+                    << "\n");
+
+  // No usable layout was produced (null attribute, or a slice without a parent
+  // layout): do not record or propagate anything.
+  if (!requiredResLayoutAttr || !requiredResLayoutAttr.getParent())
+    return;
+
+  xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
+
+  // Derive the source layout from the required result layout.
+  auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
+
+  LLVM_DEBUG(DBGS() << " srcLayoutAttr: " << srcLayoutAttr << "\n");
+
+  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
+
+  if (reduction.getAcc()) {
+    // Accumulator should have the same layout as the result.
+    propagateIfChanged(operands[1],
+                       operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
+  }
+}
+
void LayoutInfoPropagation::visitVectorBroadCastOp(
vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
@@ -1286,6 +1352,7 @@ struct ResolveLayoutConflicts {
OpBuilder builder;
LogicalResult resolveTensorDescConsumer(OpOperand &operand);
LogicalResult resolveVectorConsumer(OpOperand &operand);
+ LogicalResult assignScalarResultLayout(OpResult &result);
};
} // namespace
@@ -1294,6 +1361,21 @@ LogicalResult ResolveLayoutConflicts::run() {
// Scan all operations in the parent op and resolve layout conflicts at
// tensor descriptor and vector use points.
auto r = parentOp->walk([&](Operation *op) -> WalkResult {
+    // Reductions consume a vector and may produce a scalar (e.g. a
+    // multi-reduction over all dims). For each scalar result, add a
+    // convert_layout after it to serve as the anchor op for the reduction's
+    // result layout.
+    if (isa<vector::MultiDimReductionOp, vector::ReductionOp>(op)) {
+      for (OpResult result : op->getResults()) {
+        if (!result.getType().isIntOrFloat())
+          continue;
+        if (failed(assignScalarResultLayout(result))) {
+          LLVM_DEBUG(DBGS() << "Failed to assign scalar result layout for "
+                            << *op << "\n");
+          return WalkResult::interrupt();
+        }
+      }
+    }
for (OpOperand &operand : op->getOpOperands()) {
// Handle conflicts in tensor descriptor operands.
Type operandType = operand.get().getType();
@@ -1321,6 +1403,21 @@ LogicalResult ResolveLayoutConflicts::run() {
return r.wasInterrupted() ? failure() : success();
}
+/// Anchors the layout of a scalar result.
+///
+/// Inserts an xegpu.convert_layout right after `result` whose input and target
+/// layouts are both the layout currently attached to `result`, then redirects
+/// every other user of `result` to the new op. A result without a layout is
+/// left untouched. Always returns success.
+LogicalResult
+ResolveLayoutConflicts::assignScalarResultLayout(OpResult &result) {
+  Operation *producerOp = result.getDefiningOp();
+  // Layout currently attached to the scalar result, if any.
+  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
+  // convert_layout needs both an input and a target layout; without a layout
+  // on the result there is nothing to anchor.
+  if (!producerLayout)
+    return success();
+  builder.setInsertionPointAfterValue(result);
+  auto convertOp = xegpu::ConvertLayoutOp::create(
+      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
+      producerLayout);
+  // Redirect all users except the new op itself to the converted value.
+  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
+  return success();
+}
+
LogicalResult
ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
Value vectorValue = operand.get();
>From ba6970f273fc42567c373bf5afa57ab9eea69608 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 17:37:46 +0000
Subject: [PATCH 2/8] add tests
---
.../XeGPU/propagate-layout-subgroup.mlir | 14 ++++++++++
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 26 +++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index e4e6d61b92fda..6595feb1ba696 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -140,6 +140,20 @@ gpu.module @test {
}
}
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_row_reduction_scalar
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
+ gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+ %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
+ %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
+ gpu.return
+ }
+}
+
+
// -----
gpu.module @test {
// CHECK-LABEL: vector_nest_reduction
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 221e963ed9ac1..98d78997b95b3 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -746,6 +746,32 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
}
}
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} 0.000000e+00 : f16
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} true : i1
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : f16, memref<16xf16>, index, i1
+func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<1xi1>
+ %0 = vector.step : vector<1xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+ %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
+ %cst_0 = arith.constant 0.000000e+00 : f16
+ %4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
+ %cst_2 = arith.constant true : i1
+ %cst_3 = arith.constant 1 : index
+ xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+ return
+ }
+}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(
>From a71c24c623795dc142f9c69645aaa10443231b28 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 19:32:25 +0000
Subject: [PATCH 3/8] cleanup and add tests
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 12 +++----
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 9 ++---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 6 ++--
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 34 +++++++++---------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 36 ++++++++-----------
.../XeGPU/propagate-layout-subgroup.mlir | 11 +++---
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 10 +++---
7 files changed, 52 insertions(+), 66 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 3526178ea5753..d87d254158c27 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
OptionalAttr<DistributeLayoutAttr>:$layout);
- let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
+ let results = (outs XeGPU_ValueOrScalarType:$value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
}];
- let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value,
+ let arguments = (ins XeGPU_ValueOrScalarType:$value,
XeGPU_GatherScatterSourceType:$dest,
Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
@@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
: vector<128x128xf16>
```
}];
- let arguments = (ins XeGPU_ConvertLayoutType: $source,
+ let arguments = (ins XeGPU_ValueOrScalarType: $source,
DistributeLayoutAttr: $input_layout,
DistributeLayoutAttr: $target_layout);
- let results = (outs XeGPU_ConvertLayoutType: $result);
+ let results = (outs XeGPU_ValueOrScalarType: $result);
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
@@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
OptionalAttr<UnitAttr>:$subgroup_block_io,
OptionalAttr<DistributeLayoutAttr>:$layout
);
- let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
+ let results = (outs XeGPU_ValueOrScalarType:$res);
let assemblyFormat = [{
$mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands) `->` type(results)
@@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
let arguments = (ins
- AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
+ XeGPU_ValueOrScalarType:$data,
XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f41c0bf1fd2b2..f8b0445e3e2a5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -25,12 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
-def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_VectorOrOffsetVectorType
- : VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
-def XeGPU_ConvertLayoutType
- : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
+def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+def XeGPU_ValueOrScalarType
+ : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
def XeGPU_GatherScatterBaseAddrType
: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 9c6141a4b2a3b..9cf9a8705209b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -124,7 +124,7 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
VectorType srcVectorTy,
DistributeLayoutAttr consumerLayout,
SmallVector<int64_t> reductionDims,
- const uArch::uArch *uArch);
+ int numSg, const uArch::uArch *uArch);
/// Sets up layout for Reduction operations by creating a SliceAttr for the
/// result.
@@ -180,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
std::optional<std::tuple<DistributeLayoutAttr, DistributeLayoutAttr,
DistributeLayoutAttr>>
setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
- VectorType cdTy, DistributeLayoutAttr consumerLayout,
- const uArch::uArch *uArch, int numSg);
+ VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
+ const uArch::uArch *uArch);
/// Gets the expected layout for a given consumer operand. This will check if
/// the owning operation of the consumer operand is one of the special layout
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index acfbd202cb91a..cd5ac6567e0bc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -404,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
/// Examples:
/// 1. Subgroup layout - Row reduction on 2D tensor:
/// srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
-/// workgroupSize=32
+/// NumSg=32
/// * Consumer Layout:
/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
/// [1]>}
@@ -445,11 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
xegpu::LayoutKind layoutKind, VectorType srcVecTy,
DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
- const xegpu::uArch::uArch *uArch) {
+ int numSg, const xegpu::uArch::uArch *uArch) {
auto srcShape = srcVecTy.getShape();
int srcRank = srcShape.size();
- auto context = consumerLayout.getContext();
+ auto context = srcVecTy.getContext();
// Reduction layout requires at least 2D tensors
if (srcRank < 2)
@@ -461,21 +461,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
return DenseI32ArrayAttr::get(context, vec32);
};
- const int workgroupSize = consumerLayout.getNumSubgroups();
const int subgroupSize = uArch->getSubgroupSize();
int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
-
- SmallVector<int64_t> consumerSgLayout =
- consumerLayout.getEffectiveSgLayoutAsInt();
- SmallVector<int64_t> consumerLaneLayout =
- consumerLayout.getEffectiveLaneLayoutAsInt();
- SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
- DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
-
xegpu::DistributeLayoutAttr srcLayout;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
xegpu::SliceAttr consumerSliceLayout =
- dyn_cast<xegpu::SliceAttr>(consumerLayout);
+ consumerLayout ? dyn_cast<xegpu::SliceAttr>(consumerLayout) : nullptr;
if (consumerSliceLayout &&
consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
srcLayout = consumerSliceLayout.getParent();
@@ -487,9 +478,16 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
}
} else {
-
+ SmallVector<int64_t> consumerSgLayout =
+ consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
+ : SmallVector<int64_t>();
+ SmallVector<int64_t> consumerOrder =
+ consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
+ : SmallVector<int64_t>();
+ DenseI32ArrayAttr orderAttr =
+ consumerLayout ? consumerLayout.getOrder() : nullptr;
SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
- int remainingSgCount = workgroupSize;
+ int remainingSgCount = numSg;
int consumerIdx = 0;
// First pass: Match consumer's layout on non-reduction dimensions
@@ -507,6 +505,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
}
// Second pass: Distribute remaining subgroups across reduction dimensions
+ // the reduction to scalar case is handled only by this loop
int64_t remainOrder = consumerSgLayout.size();
for (int i = 0; i < srcRank; i++) {
if (llvm::is_contained(reductionDims, i)) {
@@ -535,7 +534,6 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
instData[srcRank - 1] =
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
-
} else if (layoutKind == xegpu::LayoutKind::Lane) {
SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
@@ -967,8 +965,8 @@ std::optional<
xegpu::DistributeLayoutAttr>>
xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
VectorType bTy, VectorType cdTy,
- xegpu::DistributeLayoutAttr consumerLayout,
- const xegpu::uArch::uArch *uArch, int numSg) {
+ xegpu::DistributeLayoutAttr consumerLayout, int numSg,
+ const xegpu::uArch::uArch *uArch) {
auto context = aTy.getContext();
const auto *uArchInstruction =
dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 5835fa7ebf868..cbc590e71f1e5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -635,8 +635,14 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
Type resultTy = reduction.getDestType();
// The layout of the result must be present.
LayoutInfo resLayoutInfo = results[0]->getValue();
- if (llvm::isa<VectorType>(resultTy) && !resLayoutInfo.isAssigned())
- return;
+
+  // A scalar (int/float) result proceeds with a null consumer layout. Any
+  // other result must already have a layout assigned; otherwise the op is not
+  // visited yet.
+  xegpu::DistributeLayoutAttr consumerLayoutAttr;
+  if (!resultTy.isIntOrFloat()) {
+    if (!resLayoutInfo.isAssigned())
+      return;
+    consumerLayoutAttr =
+        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  }
VectorType sourceTy = reduction.getSourceVectorType();
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
@@ -644,29 +650,15 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
if (!uArch)
return;
- xegpu::DistributeLayoutAttr consumerLayoutAttr;
- if (!llvm::isa<VectorType>(resultTy)) {
- auto sgSize = uArch->getSubgroupSize();
- auto numSgOrErr = getNumSg(reduction, sgSize);
+ int numSg = 0;
+ if (layoutKind == xegpu::LayoutKind::Subgroup) {
+ auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
if (failed(numSgOrErr)) {
reduction.emitWarning(
"Unable to determine the number of subgroups for the operation.");
return;
}
- auto srcShape = sourceTy.getShape();
- int srcRank = srcShape.size();
- SmallVector<int32_t> sgLayout(srcRank, 1);
- SmallVector<int32_t> sgData(srcRank, 1);
- sgLayout.back() = numSgOrErr.value();
- MLIRContext *context = reduction.getContext();
- consumerLayoutAttr = xegpu::LayoutAttr::get(
- context, DenseI32ArrayAttr::get(context, sgLayout),
- DenseI32ArrayAttr::get(context, sgData),
- /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
- /*lane_data =*/nullptr, /*order =*/nullptr);
- } else {
- consumerLayoutAttr =
- dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ numSg = numSgOrErr.value();
}
// The result layout represents the layout requirements of the operation.
@@ -675,7 +667,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
// propagated from consumer op, the conflict is resolved in later phase by
// converting the required result layout to the consumer layout
auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
- layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
+ layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);
xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
@@ -831,7 +823,7 @@ void LayoutInfoPropagation::visitDpasOp(
numSg = numSgOrErr.value();
}
auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
- consumerLayoutAttr, uArch, numSg);
+ consumerLayoutAttr, numSg, uArch);
if (!layouts.has_value()) {
dpas.emitWarning(
"Failed to determine required layouts for DPAS operands.");
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index 6595feb1ba696..bb387b4cfb093 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -128,7 +128,7 @@ gpu.module @test {
gpu.module @test {
// CHECK-LABEL: vector_row_reduction
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
- gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+ gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@@ -144,8 +144,8 @@ gpu.module @test {
gpu.module @test {
// CHECK-LABEL: vector_row_reduction_scalar
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
- gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
- %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+ gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
+ %cst = arith.constant 0.000000e+00 : f32
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
%reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
@@ -153,11 +153,10 @@ gpu.module @test {
}
}
-
// -----
gpu.module @test {
// CHECK-LABEL: vector_nest_reduction
- gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+ gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@@ -195,7 +194,7 @@ gpu.module @test {
// CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
// CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
// CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
- gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+ gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 98d78997b95b3..d049a1e57acf2 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -753,11 +753,11 @@ gpu.module @test {
// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} 0.000000e+00 : f16
+// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16
// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
-// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} true : i1
-// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} 1 : index
-// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : f16, memref<16xf16>, index, i1
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
%cst = arith.constant dense<true> : vector<1xi1>
%0 = vector.step : vector<1xindex>
@@ -765,7 +765,7 @@ func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16x
%2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
- %cst_2 = arith.constant true : i1
+ %cst_2 = arith.constant true
%cst_3 = arith.constant 1 : index
xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
return
>From 9825526aa9368422a1357861d19df82bdfb37764 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 23:28:29 +0000
Subject: [PATCH 4/8] fix bug
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 4 ++--
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 5 +++--
mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 3 ++-
.../lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 8 ++------
4 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d87d254158c27..e001419257d8f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
: vector<128x128xf16>
```
}];
- let arguments = (ins XeGPU_ValueOrScalarType: $source,
+ let arguments = (ins XeGPU_VectorOrScalarType: $source,
DistributeLayoutAttr: $input_layout,
DistributeLayoutAttr: $target_layout);
- let results = (outs XeGPU_ValueOrScalarType: $result);
+ let results = (outs XeGPU_VectorOrScalarType: $result);
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f8b0445e3e2a5..7e142b20c0894 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -26,8 +26,9 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_ValueOrScalarType
- : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
+def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
+def XeGPU_VectorOrScalarType
+ : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
def XeGPU_GatherScatterBaseAddrType
: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index cd5ac6567e0bc..67d48ee6a105e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -487,7 +487,8 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
DenseI32ArrayAttr orderAttr =
consumerLayout ? consumerLayout.getOrder() : nullptr;
SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
- int remainingSgCount = numSg;
+ int remainingSgCount =
+ consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
int consumerIdx = 0;
// First pass: Match consumer's layout on non-reduction dimensions
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index cbc590e71f1e5..fe22a46bcebf4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -653,12 +653,8 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
int numSg = 0;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
- if (failed(numSgOrErr)) {
- reduction.emitWarning(
- "Unable to determine the number of subgroups for the operation.");
- return;
- }
- numSg = numSgOrErr.value();
+ if (succeeded(numSgOrErr))
+ numSg = numSgOrErr.value();
}
// The result layout represents the layout requirements of the operation.
>From d7b4933fc0cbfda35c3de987350ff808a33065ba Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 23:38:20 +0000
Subject: [PATCH 5/8] add test
---
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index d049a1e57acf2..26936dab2fb38 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -772,6 +772,28 @@ func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16x
}
}
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>} : vector<16xf16> into f16
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
+func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<16xi1>
+ %0 = vector.step : vector<16xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+ %4 = vector.reduction <add>, %1: vector<16xf16> into f16
+ %cst_2 = arith.constant true
+ %cst_3 = arith.constant 1 : index
+ xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+ return
+ }
+}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(
>From 82dfeffd7eb2cf0c6218fba0a29b9d0d10a0a1e7 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 30 Mar 2026 22:29:04 +0000
Subject: [PATCH 6/8] fixing issues
---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 29 +++++++++----------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 8 ++---
2 files changed, 18 insertions(+), 19 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 67d48ee6a105e..b7dbfc0defc92 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -451,10 +451,6 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
int srcRank = srcShape.size();
auto context = srcVecTy.getContext();
- // Reduction layout requires at least 2D tensors
- if (srcRank < 2)
- return nullptr;
-
// Helper lambda to convert int64 vectors to int32 DenseArrayAttr
auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
SmallVector<int32_t> vec32(vec.begin(), vec.end());
@@ -532,16 +528,18 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
SmallVector<int64_t> instData(srcRank, 1);
instData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
- instData[srcRank - 1] =
- std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
+ if (srcRank >= 2)
+ instData[srcRank - 1] =
+ std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
} else if (layoutKind == xegpu::LayoutKind::Lane) {
SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
laneLayout[srcRank - 1] =
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
- laneData[srcRank - 2] =
- std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+ if (srcRank >= 2)
+ laneData[srcRank - 2] =
+ std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
toInt32Attr(laneData));
}
@@ -688,13 +686,14 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
}
} else if (layoutKind == xegpu::LayoutKind::Lane) {
for (int dim = 0; dim < srcRank; dim++) {
- assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
- "srcShape must be divisible by laneLayout for all dimensions");
- laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
- consumerLaneData[dim]);
-
- requiredResLayout =
- requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
+ if (consumerLaneData[dim] != srcShape[dim]) {
+ assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
+ "srcShape must be divisible by laneLayout for all dimensions");
+ laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
+ consumerLaneData[dim]);
+ requiredResLayout =
+ requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
+ }
}
}
return requiredResLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index fe22a46bcebf4..a35f94e7be561 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -637,12 +637,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
LayoutInfo resLayoutInfo = results[0]->getValue();
xegpu::DistributeLayoutAttr consumerLayoutAttr;
- if (!resultTy.isIntOrFloat())
+ if (!resultTy.isIntOrFloat()) {
if (!resLayoutInfo.isAssigned())
return;
- else
- consumerLayoutAttr =
- dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ }
VectorType sourceTy = reduction.getSourceVectorType();
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
>From b57ac5b1812078d98eb04bed9c9116fb40633adf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 1 Apr 2026 18:22:48 +0000
Subject: [PATCH 7/8] address feedback
---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 32 +++++++++----------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 15 ++++-----
2 files changed, 21 insertions(+), 26 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b7dbfc0defc92..55cd6ec04970c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -462,7 +462,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
xegpu::DistributeLayoutAttr srcLayout;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
xegpu::SliceAttr consumerSliceLayout =
- consumerLayout ? dyn_cast<xegpu::SliceAttr>(consumerLayout) : nullptr;
+ dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
if (consumerSliceLayout &&
consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
srcLayout = consumerSliceLayout.getParent();
@@ -526,11 +526,11 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
} else if (layoutKind == xegpu::LayoutKind::InstData) {
SmallVector<int64_t> instData(srcRank, 1);
- instData[srcRank - 2] =
- std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
if (srcRank >= 2)
- instData[srcRank - 1] =
- std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
+ instData[srcRank - 2] =
+ std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+ instData[srcRank - 1] =
+ std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
} else if (layoutKind == xegpu::LayoutKind::Lane) {
@@ -561,11 +561,11 @@ xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
xegpu::LayoutAttr srcLayout;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
- assert(true &&
- "subgroup layout assignment not supported for insertStridedSlice.");
+ assert(true && "subgroup layout assignment not supported for reduction (op "
+ "is not expected at this level).");
} else if (layoutKind == xegpu::LayoutKind::InstData) {
- assert(true &&
- "instData layout assignment not supported for insertStridedSlice.");
+ assert(true && "instData layout assignment not supported for reduction (op "
+ "is not expected at this level).");
} else if (layoutKind == xegpu::LayoutKind::Lane) {
SmallVector<int32_t> laneLayout(1), laneData(1);
laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
@@ -686,14 +686,12 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
}
} else if (layoutKind == xegpu::LayoutKind::Lane) {
for (int dim = 0; dim < srcRank; dim++) {
- if (consumerLaneData[dim] != srcShape[dim]) {
- assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
- "srcShape must be divisible by laneLayout for all dimensions");
- laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
- consumerLaneData[dim]);
- requiredResLayout =
- requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
- }
+ assert(srcShape[dim] % consumerLaneLayout[dim] == 0 &&
+ "srcShape must be divisible by laneLayout for all dimensions");
+ laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
+ consumerLaneData[dim]);
+ requiredResLayout =
+ requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
}
}
return requiredResLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index a35f94e7be561..95a4a33db48df 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1340,7 +1340,7 @@ struct ResolveLayoutConflicts {
OpBuilder builder;
LogicalResult resolveTensorDescConsumer(OpOperand &operand);
LogicalResult resolveVectorConsumer(OpOperand &operand);
- LogicalResult assignScalarResultLayout(OpResult &result);
+ LogicalResult assignResultLayout(OpResult &result);
};
} // namespace
@@ -1355,7 +1355,7 @@ LogicalResult ResolveLayoutConflicts::run() {
if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
for (OpResult result : op->getResults()) {
if (result.getType().isIntOrFloat()) {
- auto res = assignScalarResultLayout(result);
+ auto res = assignResultLayout(result);
if (failed(res)) {
DBGS() << "Failed to resolve vector consumer for multi-reduction "
<< *op << "\n";
@@ -1391,17 +1391,14 @@ LogicalResult ResolveLayoutConflicts::run() {
return r.wasInterrupted() ? failure() : success();
}
-LogicalResult
-ResolveLayoutConflicts::assignScalarResultLayout(OpResult &result) {
- Operation *ProducerOp = result.getDefiningOp();
- // Get the current layout of the vector value.
+LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
+ Operation *producerOp = result.getDefiningOp();
auto producerLayout = xegpu::getDistributeLayoutAttr(result);
- // Insert a convert_layout op to resolve the conflict.
+ // Insert a convert_layout op to assign the layout.
builder.setInsertionPointAfterValue(result);
auto convertOp = xegpu::ConvertLayoutOp::create(
- builder, ProducerOp->getLoc(), result.getType(), result, producerLayout,
+ builder, producerOp->getLoc(), result.getType(), result, producerLayout,
producerLayout);
- // Update the users to use the converted value.
result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
return success();
}
>From 391a0f40796ddc4daca181eeb99d97fb34689626 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 1 Apr 2026 18:57:01 +0000
Subject: [PATCH 8/8] remove debug print
---
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 18 +-----------------
1 file changed, 1 insertion(+), 17 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 95a4a33db48df..4c30dacae8850 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -680,37 +680,21 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
void LayoutInfoPropagation::visitVectorReductionOp(
vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // The layout of the result must be present.
VectorType sourceTy = reduction.getSourceVectorType();
-
- LLVM_DEBUG(DBGS() << "visitVectorReductionOp: " << reduction << "\n");
- LLVM_DEBUG(DBGS() << " sourceTy: " << sourceTy << "\n");
-
const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
if (!uArch)
return;
auto requiredResLayoutAttr =
xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
-
- LLVM_DEBUG(DBGS() << " requiredResLayoutAttr: " << requiredResLayoutAttr
- << "\n");
-
xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
- // derive the source layout from the dominant layout and reduction dims
auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
-
- LLVM_DEBUG(DBGS() << " srcLayoutAttr: " << srcLayoutAttr << "\n");
-
propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
-
- if (reduction.getAcc()) {
- // Accumulator should have the same layout as the result.
+ if (reduction.getAcc())
propagateIfChanged(operands[1],
operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
- }
}
void LayoutInfoPropagation::visitVectorBroadCastOp(
More information about the Mlir-commits
mailing list