[Mlir-commits] [mlir] [mlir][xegpu] Add support for `vector.reduction` and `vector.multi_reduction` subgroup to work-item distribution. (PR #180308)
Charitha Saumya
llvmlistbot at llvm.org
Fri Feb 13 11:38:19 PST 2026
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/180308
>From 4e252eea5664746ee2a10c27ae0539b4b9e0b283 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 3 Feb 2026 00:28:12 +0000
Subject: [PATCH 01/17] save work
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 3 +
.../XeGPUSgToWiDistributeExperimental.cpp | 65 ++++++++++++++++++-
.../Transforms/XeGPUSubgroupDistribute.cpp | 17 +----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 15 +++++
.../XeGPU/sg-to-wi-experimental-unit.mlir | 28 ++++++++
5 files changed, 110 insertions(+), 18 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 700db5f9dd9be..5d54bbc5aca66 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -129,6 +129,9 @@ SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);
+Value subgroupReduction(Location loc, OpBuilder &builder, Value input,
+ vector::CombiningKind kind, uint32_t size);
+
/// Helper Function to find a proper instruction multiple for the user-supplied
/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes.
/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 8e530642d9c7a..9a2d72d7db6a4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -23,6 +23,7 @@
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
+#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/LogicalResult.h"
@@ -362,6 +363,59 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
}
};
+struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
+  using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
+
+  /// Distributes a rank-1 `vector.reduction` whose source carries a subgroup
+  /// layout. The per-lane source taken from the adaptor is reduced across the
+  /// lanes with xegpu::subgroupReduction and, when an accumulator is present,
+  /// it is combined with the reduced value afterwards.
+  LogicalResult
+  matchAndRewrite(vector::ReductionOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
+
+    // If no layout, nothing to do.
+    if (!layout || !layout.isForSubgroup())
+      return failure();
+
+    VectorType vectorType = op.getSourceVectorType();
+
+    // Only rank 1 vectors supported.
+    if (vectorType.getRank() != 1)
+      return rewriter.notifyMatchFailure(
+          op, "Only rank 1 reductions can be distributed.");
+    // Lane layout must have the same rank as the vector.
+    if (layout.getRank() != vectorType.getRank())
+      return rewriter.notifyMatchFailure(
+          op, "Layout rank does not match vector rank.");
+
+    // Get the subgroup size from the layout.
+    int64_t sgSize = layout.getEffectiveLaneLayoutAsInt()[0];
+
+    // The vector length only has to be a multiple of the subgroup size (not
+    // equal to it): each lane may hold more than one element, and those are
+    // reduced lane-locally before the cross-lane shuffles.
+    if (vectorType.getShape()[0] % sgSize != 0)
+      return rewriter.notifyMatchFailure(
+          op, "Reduction vector dimension must be a multiple of subgroup "
+              "size.");
+
+    if (!op.getType().isIntOrFloat())
+      return rewriter.notifyMatchFailure(
+          op, "Reduction distribution currently only supports floats and "
+              "integer types.");
+
+    // Get the distributed vector (per work-item portion).
+    Value laneValVec = adaptor.getVector();
+
+    // Distribute and reduce across work-items in the subgroup.
+    Value fullReduce = xegpu::subgroupReduction(
+        op.getLoc(), rewriter, laneValVec, op.getKind(), sgSize);
+
+    // If there's an accumulator, combine it with the reduced value.
+    if (adaptor.getAcc())
+      fullReduce = vector::makeArithReduction(
+          rewriter, op.getLoc(), op.getKind(), fullReduce, adaptor.getAcc());
+
+    rewriter.replaceOp(op, fullReduce);
+    return success();
+  }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -551,8 +605,15 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
}
return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
});
+ // VectorReductionOp is legal only if its source has no distribute layout
+ // attribute.
+ target.addDynamicallyLegalOp<vector::ReductionOp>(
+ [=](vector::ReductionOp op) -> bool {
+ auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
+ return !layout;
+ });
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
- SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
- typeConverter, patterns.getContext());
+ SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
+ SgToWiVectorReduction>(typeConverter, patterns.getContext());
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a8ed5a289f84a..11e3bc8eb1b27 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -2112,23 +2112,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
int64_t warpSz) { return Value(); };
- auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
- vector::CombiningKind kind, uint32_t size) {
- // First reduce on a single thread to get per lane reduction value.
- Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
- // Parallel reduction using butterfly shuffles.
- for (uint64_t i = 1; i < size; i <<= 1) {
- Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
- /*width=*/size,
- /*mode=*/gpu::ShuffleMode::XOR)
- .getShuffleResult();
- laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
- }
- return laneVal;
- };
-
vector::populateDistributeReduction(
- patterns, warpReduction,
+ patterns, xegpu::subgroupReduction,
/*pattern benefit=*/PatternHierarchy::Regular);
vector::populatePropagateWarpVectorDistributionPatterns(
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 7e28c756f2d72..ff19a881c7bbc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -756,6 +756,21 @@ int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates,
return largest;
}
+// Reduces `input` within each lane first, then combines the per-lane values
+// across `size` lanes with XOR shuffles whose offset doubles on every step.
+Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
+                               vector::CombiningKind kind, uint32_t size) {
+  // First reduce on a single thread to get per lane reduction value.
+  Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
+  // Parallel reduction using butterfly shuffles.
+  for (uint64_t i = 1; i < size; i <<= 1) {
+    Value shuffled =
+        gpu::ShuffleOp::create(builder, loc, laneVal, i, /*width=*/size,
+                               /*mode=*/gpu::ShuffleMode::XOR)
+            .getShuffleResult();
+    laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
+  }
+  return laneVal;
+}
+
/// Explicit instantiations
template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
ArrayRef<int> candidateMultiples);
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 0e9843f4626d4..aea4defd0cde5 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -149,4 +149,32 @@ gpu.func @prefetch_nd() {
: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @vector_reduction
+// CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: %[[LANE_RED:.*]] = vector.reduction <add>, %[[CAST:.*]] : vector<2xf32> into f32
+// CHECK: %[[C16_1:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE1:.*]], %{{.*}} = gpu.shuffle xor %[[LANE_RED]], %[[C1]], %[[C16_1]] : f32
+// CHECK: %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1]] : f32
+// CHECK: %[[C16_2:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD1]], %[[C2]], %[[C16_2]] : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2]] : f32
+// CHECK: %[[C16_3:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %[[C4]], %[[C16_3]] : f32
+// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK: %[[C16_4:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %[[C8]], %[[C16_4]] : f32
+// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
+gpu.func @vector_reduction() {
+ %acc = arith.constant 1.0 : f32
+ %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : () -> vector<32xf32>
+ %2 = vector.reduction <add>, %0, %acc : vector<32xf32> into f32
+ gpu.return
+}
+
}
>From b8607418e603e7b0d6b482cb965b356a297d67e5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 3 Feb 2026 00:36:40 +0000
Subject: [PATCH 02/17] save work
---
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 5 +++++
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 4 ++++
2 files changed, 9 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 5d54bbc5aca66..6eaad8a499986 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -129,6 +129,11 @@ SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);
+/// Given an `input` value representing per-lane data, this function returns the
+/// result after performing a reduction on the input over all lanes (number of
+/// lanes given by `size`). This uses butterfly shuffles to perform the
+/// reduction in a log2(size) number of steps.
+/// NOTE: Implementation taken from TestVectorTransforms.cpp
Value subgroupReduction(Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind, uint32_t size);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 9a2d72d7db6a4..5fd75e205bf18 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -363,6 +363,10 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
}
};
+/// This pattern distributes a subgroup-level vector.reduction op to
+/// workitem-level. This requires shuffling the data across the workitems (using
+/// gpu::ShuffleOp) and reducing in stages until all workitems have the final
+/// result.
struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
>From 522c9858e1b424f33ffcd5800956a173ac2f779f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 5 Feb 2026 00:47:50 +0000
Subject: [PATCH 03/17] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 5 +
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 10 ++
.../XeGPUSgToWiDistributeExperimental.cpp | 92 +++++++++++++++++++
.../Transforms/XeGPUSubgroupDistribute.cpp | 67 +-------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 61 ++++++++++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 37 +++++++-
6 files changed, 206 insertions(+), 66 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index fede329990be4..ce75be245ffc4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -82,6 +82,11 @@ void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target);
+/// Appends patterns to rewrite vector::MultiDimReductionOp in terms of
+/// vector::ReductionOps if the multi-reduction involves cross-lane data
+/// movement.
+void populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
+ RewritePatternSet &patterns);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 6eaad8a499986..45a78f64eb869 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -137,6 +137,16 @@ SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
Value subgroupReduction(Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind, uint32_t size);
+/// Given the `src` and `acc` arguments of a vector::MultiDimReductionOp,
+/// lower to a set of vector::ReductionOp ops over 1D slices extracted from
+/// `src`. The reduction is performed along `reductionDim`. The result is a
+/// vector with the same shape as `acc`.
+/// TODO: Only 2D to 1D reduction is supported for now.
+Value lowerToVectorReductions(TypedValue<VectorType> src,
+ TypedValue<VectorType> acc,
+ vector::CombiningKind kind, int64_t reductionDim,
+ Location loc, PatternRewriter &rewriter);
+
/// Helper Function to find a proper instruction multiple for the user-supplied
/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes.
/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 5fd75e205bf18..1eceff48348e2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -420,6 +420,92 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
}
};
+/// Rewrites a subgroup-level 2D `vector.multi_reduction` with a single
+/// reduction dimension into a sequence of 1D `vector.reduction` ops using
+/// xegpu::lowerToVectorReductions. The source must carry a subgroup layout
+/// that distributes exactly one of its two dimensions.
+struct SgToWiRewriteVectorMultiReduction
+    : public OpConversionPattern<vector::MultiDimReductionOp> {
+  using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto layout = xegpu::getDistributeLayoutAttr(op.getSource());
+
+    // If no layout, nothing to do.
+    if (!layout || !layout.isForSubgroup())
+      return failure();
+
+    VectorType sourceType = op.getSourceVectorType();
+    // Only 2D vectors are supported.
+    if (sourceType.getRank() != 2)
+      return rewriter.notifyMatchFailure(op,
+                                         "Only 2D reductions are supported.");
+
+    ArrayRef<int64_t> reductionDims = op.getReductionDims();
+    // Only 1 reduction dimension supported.
+    if (reductionDims.size() != 1)
+      return rewriter.notifyMatchFailure(
+          op, "Only 1 reduction dimension is supported.");
+
+    int64_t reductionDim = reductionDims[0];
+
+    // Get the distributed source type based on layout.
+    FailureOr<VectorType> sourceDistTypeOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, sourceType);
+    if (failed(sourceDistTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "Failed to distribute the source vector type.");
+
+    VectorType sourceDistType = sourceDistTypeOrFailure.value();
+
+    // A dimension is distributed when its per-lane extent differs from the
+    // subgroup-level extent.
+    bool dim0Distributed =
+        sourceDistType.getShape()[0] != sourceType.getShape()[0];
+    bool dim1Distributed =
+        sourceDistType.getShape()[1] != sourceType.getShape()[1];
+
+    if (dim0Distributed && dim1Distributed)
+      return rewriter.notifyMatchFailure(
+          op, "Expecting source to be distributed in a single dimension.");
+
+    if (!dim0Distributed && !dim1Distributed)
+      return rewriter.notifyMatchFailure(
+          op, "Expecting a distributed source vector.");
+
+    // Both the lane-local case (reduction dim is NOT the distributed dim) and
+    // the cross-lane case (reduction dim IS the distributed dim) are lowered
+    // to the same sequence of 1D reductions over the distributed operands.
+    // In the cross-lane case the individual 1D reductions are expected to be
+    // handled by the SgToWiVectorReduction pattern, which performs the
+    // cross-lane shuffles.
+    Value result = xegpu::lowerToVectorReductions(
+        cast<TypedValue<VectorType>>(adaptor.getSource()),
+        cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+        reductionDim, op.getLoc(), rewriter);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -621,3 +707,9 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
SgToWiVectorReduction>(typeConverter, patterns.getContext());
}
+
+// Registers the pattern that rewrites vector::MultiDimReductionOp into
+// vector::ReductionOps.
+void xegpu::
+    populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
+        RewritePatternSet &patterns) {
+  MLIRContext *ctx = patterns.getContext();
+  patterns.add<SgToWiRewriteVectorMultiReduction>(ctx);
+}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 11e3bc8eb1b27..346c31b5a8a7e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1205,69 +1205,6 @@ struct SinkUniformOps final : public gpu::WarpDistributionPattern {
}
};
-/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
-/// VectorReductionOps. We also insert layouts for the newly created ops.
-static Value lowerToVectorReductions(TypedValue<VectorType> src,
- TypedValue<VectorType> acc,
- vector::CombiningKind kind,
- int64_t reductionDim, Location loc,
- PatternRewriter &rewriter) {
- // Expecting a 2D source vector.
- assert(src.getType().getRank() == 2 && "expected a 2D source vector");
- VectorType sourceType = src.getType();
- int64_t sourceH = sourceType.getShape()[0];
- int64_t sourceW = sourceType.getShape()[1];
- int nSlices = (reductionDim == 0) ? sourceW : sourceH;
- // Create a constant vector to hold the result of the reduction.
- TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
- Value reductionResult = arith::ConstantOp::create(
- rewriter, loc, acc.getType(),
- DenseElementsAttr::get(acc.getType(), zeroAttr));
- // Reduction result should have the same layout as the accumulator.
- xegpu::setTemporaryLayout(cast<OpResult>(reductionResult),
- xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc)));
- // For each slice of the source, extract the slice vector, do a reduction
- // and, insert the reduced value back to the result vector.
- for (int i = 0; i < nSlices; ++i) {
- SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
- if (reductionDim == 1) {
- sliceOffsets = {i, 0};
- sliceSizes = {1, sourceW};
- } else {
- sliceOffsets = {0, i};
- sliceSizes = {sourceH, 1};
- }
- vector::ExtractStridedSliceOp extractOp =
- vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
- sliceSizes, {1, 1});
-
- int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
-
- vector::ShapeCastOp slice = vector::ShapeCastOp::create(
- rewriter, loc,
- VectorType::get({nSliceElements}, sourceType.getElementType()),
- extractOp.getResult());
-
- // Shape cast is currently handled in xegpu side. So layouts must be
- // retained during lowering. Shape cast output has the same layout as the
- // accumulator. Shape cast source has the same layout as the original
- // reduction source.
- // TODO: other ops generated here may also need layout attributes.
- auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
- auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
-
- xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
- xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
- // Extract and reduction results in scalars, so no result layout is needed.
- Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
- Value reduction = vector::ReductionOp::create(
- rewriter, loc, kind, slice.getResult(), accExtract);
- reductionResult =
- vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
- }
- return reductionResult;
-}
-
/// This pattern distributes the `vector.multi_reduction` operation across
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
/// layouts for the source and accumulator vectors,
@@ -1405,7 +1342,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
{sourceDistType, distributedResultType}, newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
- Value result = lowerToVectorReductions(
+ Value result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
@@ -1417,7 +1354,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
// of multiple ReductionOps. Actual distribution is done by the
// WarpOpReduction pattern.
rewriter.setInsertionPointAfter(reductionOp);
- Value result = lowerToVectorReductions(
+ Value result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(reductionOp.getSource()),
cast<TypedValue<VectorType>>(reductionOp.getAcc()),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index ff19a881c7bbc..c4bab915d9e64 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -771,6 +771,67 @@ Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
return laneVal;
};
+// Lowers a 2D source/1D accumulator pair into one vector::ReductionOp per
+// slice along `reductionDim`, inserting each scalar result into a vector of
+// the accumulator's type. Temporary layouts are attached to the result
+// constant and to the shape casts created here.
+Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
+                                     TypedValue<VectorType> acc,
+                                     vector::CombiningKind kind,
+                                     int64_t reductionDim, Location loc,
+                                     PatternRewriter &rewriter) {
+  // Expecting a 2D source vector.
+  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
+  VectorType sourceType = src.getType();
+  int64_t sourceH = sourceType.getShape()[0];
+  int64_t sourceW = sourceType.getShape()[1];
+  int64_t nSlices = (reductionDim == 0) ? sourceW : sourceH;
+  // The layouts of `src` and `acc` do not change while the slices are being
+  // built, so look them up once instead of once per slice.
+  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+  // Create a constant vector to hold the result of the reduction.
+  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
+  Value reductionResult = arith::ConstantOp::create(
+      rewriter, loc, acc.getType(),
+      DenseElementsAttr::get(acc.getType(), zeroAttr));
+  // Reduction result should have the same layout as the accumulator.
+  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+  // For each slice of the source, extract the slice vector, do a reduction
+  // and, insert the reduced value back to the result vector.
+  for (int64_t i = 0; i < nSlices; ++i) {
+    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
+    if (reductionDim == 1) {
+      sliceOffsets = {i, 0};
+      sliceSizes = {1, sourceW};
+    } else {
+      sliceOffsets = {0, i};
+      sliceSizes = {sourceH, 1};
+    }
+    vector::ExtractStridedSliceOp extractOp =
+        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
+                                              sliceSizes, {1, 1});
+
+    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
+
+    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
+        rewriter, loc,
+        VectorType::get({nSliceElements}, sourceType.getElementType()),
+        extractOp.getResult());
+
+    // Shape cast is currently handled in xegpu side. So layouts must be
+    // retained during lowering. Shape cast output has the same layout as the
+    // accumulator. Shape cast source has the same layout as the original
+    // reduction source.
+    // TODO: other ops generated here may also need layout attributes.
+    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
+    // Extract and reduction results in scalars, so no result layout is needed.
+    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
+    Value reduction = vector::ReductionOp::create(
+        rewriter, loc, kind, slice.getResult(), accExtract);
+    reductionResult =
+        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
+  }
+  return reductionResult;
+}
+
/// Explicit instantiations
template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
ArrayRef<int> candidateMultiples);
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 405e974500e08..bddb06e92da66 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -272,6 +272,12 @@ struct TestXeGPUSgToWiDistributeExperimental
"Work-item Distribution";
}
+ Option<bool> enablePartialReductionLowering{
+ *this, "enable-partial-multi-reduction-lowering",
+ llvm::cl::desc(
+ "Partially lower multi-reduction ops to vector.reduction and stop."),
+ llvm::cl::init(false)};
+
void getDependentDialects(::mlir::DialectRegistry &registry) const override {
registry.insert<arith::ArithDialect>();
registry.insert<memref::MemRefDialect>();
@@ -283,7 +289,8 @@ struct TestXeGPUSgToWiDistributeExperimental
TestXeGPUSgToWiDistributeExperimental() = default;
TestXeGPUSgToWiDistributeExperimental(
- const TestXeGPUSgToWiDistributeExperimental &pass) = default;
+ const TestXeGPUSgToWiDistributeExperimental &pass)
+ : PassWrapper(pass) {}
void runOnOperation() override {
MLIRContext *ctx = &getContext();
@@ -297,6 +304,34 @@ struct TestXeGPUSgToWiDistributeExperimental
};
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
+
+    // If `enablePartialReductionLowering` is set, only focus on testing the
+    // partial lowering of vector::MultiReductionOp.
+    if (enablePartialReductionLowering) {
+      xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+      ConversionTarget target(*ctx);
+      RewritePatternSet patterns(ctx);
+      // Use the name declared in Transforms.h; the shorter spelling used
+      // previously is not declared anywhere in this patch.
+      xegpu::
+          populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
+              patterns);
+      // Mark 2D to 1D vector::MultiDimReductionOp as illegal.
+      target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+          [](vector::MultiDimReductionOp op) {
+            int64_t sourceRank = op.getSourceVectorType().getRank();
+            VectorType resultType =
+                dyn_cast<VectorType>(op.getResult().getType());
+            // A scalar result is not a 2D to 1D reduction, so it stays legal.
+            if (!resultType)
+              return true;
+            int64_t resultRank = resultType.getRank();
+            return sourceRank != 2 || resultRank != 1;
+          });
+      // vector::ReductionOp is legal.
+      target.addDynamicallyLegalOp<vector::ReductionOp>(
+          [](vector::ReductionOp op) { return true; });
+      target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+      (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+      return;
+    }
+
ConversionTarget target(*ctx);
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
>From 1955d3c04b5a19d8c862835d33435632103c5092 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 7 Feb 2026 00:55:00 +0000
Subject: [PATCH 04/17] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 6 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 7 +
.../XeGPUSgToWiDistributeExperimental.cpp | 211 +++++++++++++-----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 20 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 98 ++++++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 33 +--
6 files changed, 276 insertions(+), 99 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index ce75be245ffc4..9967c4b2740bc 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -85,8 +85,8 @@ void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
/// Appends patterns to rewrite vector::MultiDimReductionOp in terms of
/// vector::ReductionOps if the multi-reduction involves cross-lane data
/// movement.
-void populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
- RewritePatternSet &patterns);
+void populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
+ RewritePatternSet &patterns, ConversionTarget &target);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
@@ -98,7 +98,7 @@ void populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
/// 1. the unrolled type `unrolledType` and number of unrolled instances
/// `numUnrolledInstances` are computed from the `targetShape`.
/// 2. pack each operand. ExtractStridedSlice are created to break-up the
-/// vector operands. And BuiltinUnrealizedCastop are created to break-up
+/// vector operands. And BuiltinUnrealizedCastOp are created to break-up
/// the TensorDesc operands.
/// 3. the original op is cloned `numUnrolledInstances` times, once for each
/// result.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index faafb7e8cee61..78f56cda281f0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -840,6 +840,9 @@ void LayoutInfoPropagation::visitDpasOp(
// Step 1. Get all valid layouts for A, B, and C operands.
// All operands must have at least one valid subgroup layout.
LayoutInfo layoutD = results[0]->getValue();
+ llvm::errs() << "DPAS result layout: ";
+ layoutD.print(llvm::errs());
+ llvm::errs() << "\n";
SmallVector<int> sgLayoutD = layoutD.getSgLayout();
assert(!sgLayoutD.empty() && "Expected layout for DPAS result.");
auto layoutDVal = std::make_pair(sgLayoutD[0], sgLayoutD[1]);
@@ -994,6 +997,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
"Unable to determine the number of subgroups for the operation.");
return;
}
+ llvm::errs() << "Num SG: " << numSgOrErr.value() << "\n";
auto sgLayouts = getValidLayouts(store.getValueType().getShape(),
instData, numSgOrErr.value());
if (sgLayouts.empty()) {
@@ -1011,6 +1015,9 @@ void LayoutInfoPropagation::visitStoreNdOp(
DenseI32ArrayAttr::get(dataTy.getContext(), sgData),
/*inst_data =*/nullptr, /*lane_layout =*/nullptr,
/*lane_data =*/nullptr, /*order =*/nullptr));
+ llvm::errs() << "Chosen SG Layout:";
+ storeLayout.print(llvm::errs());
+ llvm::errs() << "\n";
}
store.setLayoutAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 1eceff48348e2..f116b20eea4aa 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -420,24 +420,121 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
}
};
-class SgToWiRewriteVectorMultiReduction
+// class SgToWiRewriteVectorMultiReduction
+// : public OpConversionPattern<vector::MultiDimReductionOp> {
+// using
+// OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
+
+// LogicalResult
+// matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
+// ConversionPatternRewriter &rewriter) const override {
+// auto layout = xegpu::getDistributeLayoutAttr(op.getSource());
+
+// // If no layout, nothing to do.
+// if (!layout || !layout.isForSubgroup())
+// return failure();
+
+// VectorType sourceType = op.getSourceVectorType();
+// // Only 2D vectors are supported.
+// if (sourceType.getRank() != 2)
+// return rewriter.notifyMatchFailure(op,
+// "Only 2D reductions are
+// supported.");
+
+// ArrayRef<int64_t> reductionDims = op.getReductionDims();
+// // Only 1 reduction dimension supported.
+// if (reductionDims.size() != 1)
+// return rewriter.notifyMatchFailure(
+// op, "Only 1 reduction dimension is supported.");
+
+// int64_t reductionDim = reductionDims[0];
+
+// // Get the distributed source type based on layout.
+// FailureOr<VectorType> sourceDistTypeOrFailure =
+// getDistVecTypeBasedOnLaneLayout(layout, sourceType);
+// if (failed(sourceDistTypeOrFailure))
+// return rewriter.notifyMatchFailure(
+// op, "Failed to distribute the source vector type.");
+
+// VectorType sourceDistType = sourceDistTypeOrFailure.value();
+
+// // Determine which dimension is distributed.
+// bool dim0Distributed =
+// sourceDistType.getShape()[0] != sourceType.getShape()[0];
+// bool dim1Distributed =
+// sourceDistType.getShape()[1] != sourceType.getShape()[1];
+
+// if (dim0Distributed && dim1Distributed)
+// return rewriter.notifyMatchFailure(
+// op, "Expecting source to be distributed in a single dimension.");
+
+// int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
+// if (sourceDistDim == -1)
+// return rewriter.notifyMatchFailure(
+// op, "Expecting a distributed source vector.");
+
+// // Check if reduction is lane-local or requires cross-lane shuffling.
+// // Lane-local: reduction dimension is NOT the distributed dimension.
+// // Cross-lane: reduction dimension IS the distributed dimension.
+// bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
+// (sourceDistDim == 1 && reductionDim == 0);
+
+// // Get the distributed source and accumulator.
+// Value distributedSource = adaptor.getSource();
+// Value distributedAcc = adaptor.getAcc();
+
+// if (isReductionLaneLocal) {
+// // For lane-local reduction, each lane reduces its local data.
+// // Use lowerToVectorReductions to create a sequence of 1D reductions.
+// Value result = xegpu::lowerToVectorReductions(
+// cast<TypedValue<VectorType>>(distributedSource),
+// cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
+// reductionDim, op.getLoc(), rewriter);
+
+// rewriter.replaceOp(op, result);
+// return success();
+// }
+
+// // For non-lane-local (cross-lane) reduction, we also lower to a sequence
+// // of 1D reductions. The individual 1D reductions will be handled by
+// // SgToWiVectorReduction pattern which performs the cross-lane shuffles.
+// Value result = xegpu::lowerToVectorReductions(
+// cast<TypedValue<VectorType>>(distributedSource),
+// cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
+// reductionDim, op.getLoc(), rewriter);
+
+// rewriter.replaceOp(op, result);
+// return success();
+// }
+// };
+
+struct LowerVectorMultiReductionPattern
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto layout = xegpu::getDistributeLayoutAttr(op.getSource());
-
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
// If no layout, nothing to do.
- if (!layout || !layout.isForSubgroup())
+ if (!resLayout || !resLayout.isForSubgroup())
return failure();
-
VectorType sourceType = op.getSourceVectorType();
// Only 2D vectors are supported.
if (sourceType.getRank() != 2)
- return rewriter.notifyMatchFailure(op,
- "Only 2D reductions are supported.");
+ return rewriter.notifyMatchFailure(
+ op, "Expecting 2D source vector in vector::MultiDimReductionOp at "
+ "subgroup level.");
+ VectorType resType = dyn_cast<VectorType>(op.getType());
+ if (!resType)
+ return rewriter.notifyMatchFailure(op, "Expecting vector result type");
+ // Compute the distributed vector type based on the layout.
+ FailureOr<VectorType> resDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resType);
+ if (failed(resDistTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to compute the distributed vector type based on the "
+ "layout.");
ArrayRef<int64_t> reductionDims = op.getReductionDims();
// Only 1 reduction dimension supported.
@@ -447,59 +544,19 @@ class SgToWiRewriteVectorMultiReduction
int64_t reductionDim = reductionDims[0];
- // Get the distributed source type based on layout.
- FailureOr<VectorType> sourceDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, sourceType);
- if (failed(sourceDistTypeOrFailure))
- return rewriter.notifyMatchFailure(
- op, "Failed to distribute the source vector type.");
-
- VectorType sourceDistType = sourceDistTypeOrFailure.value();
-
- // Determine which dimension is distributed.
- bool dim0Distributed =
- sourceDistType.getShape()[0] != sourceType.getShape()[0];
- bool dim1Distributed =
- sourceDistType.getShape()[1] != sourceType.getShape()[1];
-
- if (dim0Distributed && dim1Distributed)
- return rewriter.notifyMatchFailure(
- op, "Expecting source to be distributed in a single dimension.");
-
- int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
- if (sourceDistDim == -1)
- return rewriter.notifyMatchFailure(
- op, "Expecting a distributed source vector.");
-
- // Check if reduction is lane-local or requires cross-lane shuffling.
- // Lane-local: reduction dimension is NOT the distributed dimension.
- // Cross-lane: reduction dimension IS the distributed dimension.
- bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
- (sourceDistDim == 1 && reductionDim == 0);
-
- // Get the distributed source and accumulator.
- Value distributedSource = adaptor.getSource();
- Value distributedAcc = adaptor.getAcc();
-
- if (isReductionLaneLocal) {
- // For lane-local reduction, each lane reduces its local data.
- // Use lowerToVectorReductions to create a sequence of 1D reductions.
- Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(distributedSource),
- cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
- reductionDim, op.getLoc(), rewriter);
-
- rewriter.replaceOp(op, result);
- return success();
- }
+ // Rewrite is only needed when the reduction is not lane-local.
+ // If the reduction is lane-local, the result type differs before and
+ // after distribution (i.e. the result is distributed across lanes).
+ bool reductionDimDistributed =
+ resType.getShape() != resDistTypeOrFailure.value().getShape();
+ if (reductionDimDistributed)
+ return failure();
- // For non-lane-local (cross-lane) reduction, we also lower to a sequence
- // of 1D reductions. The individual 1D reductions will be handled by
- // SgToWiVectorReduction pattern which performs the cross-lane shuffles.
+ // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(distributedSource),
- cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
- reductionDim, op.getLoc(), rewriter);
+ cast<TypedValue<VectorType>>(op.getSource()),
+ cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(), reductionDim,
+ op.getLoc(), rewriter);
rewriter.replaceOp(op, result);
return success();
@@ -708,8 +765,38 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
SgToWiVectorReduction>(typeConverter, patterns.getContext());
}
-void xegpu::
- populateXeGPUSgToWiRewriteVectorMultiReductionToVectorReductionPatterns(
- RewritePatternSet &patterns) {
- patterns.add<SgToWiRewriteVectorMultiReduction>(patterns.getContext());
+void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
+ RewritePatternSet &patterns, ConversionTarget &target) {
+ // vector::MultiDimReductionOp is legal only if the reduction dimension is
+ // not distributed.
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [&](vector::MultiDimReductionOp op) {
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ // If no layout, mark legal.
+ if (!resLayout || !resLayout.isForSubgroup())
+ return true;
+ VectorType resTy = dyn_cast<VectorType>(op.getType());
+ if (!resTy)
+ return true;
+ // Compute the distributed result vector type based on the layout.
+ FailureOr<VectorType> resDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
+ if (failed(resDistTypeOrFailure))
+ return true;
+
+ ArrayRef<int64_t> reductionDims = op.getReductionDims();
+ // Only 1 reduction dimension supported.
+ if (reductionDims.size() != 1)
+ return true;
+ // Op is legal if the reduction dimension is not distributed.
+ // If the reduction dim is not distributed, each lane does its own
+ // local reduction and the result is distributed. So result types
+ // before and after distribution should not match.
+ return resTy != resDistTypeOrFailure.value();
+ });
+ // vector::ReductionOp is legal.
+ target.addDynamicallyLegalOp<vector::ReductionOp>(
+ [&](vector::ReductionOp op) { return true; });
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+ patterns.add<LowerVectorMultiReductionPattern>(patterns.getContext());
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index c4bab915d9e64..90460d9a5837d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -787,9 +787,10 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
+ auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+ auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
// Reduction result should have the same layout as the accumulator.
- xegpu::setTemporaryLayout(cast<OpResult>(reductionResult),
- xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc)));
+ xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -801,9 +802,12 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
sliceOffsets = {0, i};
sliceSizes = {sourceH, 1};
}
+
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
+ // Extract strided slice has the same layout as src.
+ xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
@@ -812,14 +816,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
- // Shape cast is currently handled in xegpu side. So layouts must be
- // retained during lowering. Shape cast output has the same layout as the
- // accumulator. Shape cast source has the same layout as the original
- // reduction source.
- // TODO: other ops generated here may also need layout attributes.
- auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
- auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
-
+ // Shape cast output has the same layout as the accumulator. Shape cast
+ // source has the same layout as the original reduction source.
xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
@@ -828,6 +826,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
rewriter, loc, kind, slice.getResult(), accExtract);
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
+ // Insert op should have the same layout as the accumulator.
+ xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
}
return reductionResult;
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index aea4defd0cde5..5e2dac9bbb79a 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,6 +2,10 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --allow-unregistered-dialect \
+// RUN: --test-xegpu-sg-to-wi-distribute-experimental="enable-REWRITE-to-reductions" \
+// RUN: --split-input-file %s | FileCheck --check-prefix=CHECK-REWRITE %s
+
gpu.module @xevm_module {
@@ -177,4 +181,98 @@ gpu.func @vector_reduction() {
gpu.return
}
+
+// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK-REWRITE-DAG: %[[SRC:.*]] = "some_def"() {layout_result_0 =
+// CHECK-REWRITE-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32>
+// CHECK-REWRITE-DAG: %[[ACC:.*]] = arith.constant
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
+// CHECK-REWRITE-DAG: %[[ZERO:.*]] = arith.constant
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
+// CHECK-REWRITE: %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+// CHECK-REWRITE-SAME: offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
+// CHECK-REWRITE-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
+// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: : vector<1x16xf32> to vector<16xf32>
+// CHECK-REWRITE-NEXT: %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
+// CHECK-REWRITE-NEXT: %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+// CHECK-REWRITE-SAME: offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
+// CHECK-REWRITE-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
+// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: : vector<1x16xf32> to vector<16xf32>
+// CHECK-REWRITE-NEXT: %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
+// CHECK-REWRITE-NEXT: %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<2x16xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<2x16xf32> to vector<2xf32>
+ gpu.return
+}
+
+// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
+// CHECK-REWRITE-DAG: %[[SRC:.*]] = "some_def"() {layout_result_0 =
+// CHECK-REWRITE-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32>
+// CHECK-REWRITE-DAG: %[[ACC:.*]] = arith.constant
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
+// CHECK-REWRITE-DAG: %[[ZERO:.*]] = arith.constant
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
+// CHECK-REWRITE: %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-REWRITE-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK-REWRITE-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
+// CHECK-REWRITE-SAME: {{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-REWRITE-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK-REWRITE-NEXT: %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
+// CHECK-REWRITE-NEXT: %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-REWRITE-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK-REWRITE-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
+// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
+// CHECK-REWRITE-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK-REWRITE-NEXT: %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
+// CHECK-REWRITE-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
+// CHECK-REWRITE-NEXT: %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
+// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<16x2xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+ dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
+ }
+ [0] : vector<16x2xf32> to vector<2xf32>
+ gpu.return
+}
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index bddb06e92da66..7302865bb61ca 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -272,10 +272,10 @@ struct TestXeGPUSgToWiDistributeExperimental
"Work-item Distribution";
}
- Option<bool> enablePartialReductionLowering{
- *this, "enable-partial-multi-reduction-lowering",
- llvm::cl::desc(
- "Partially lower multi-reduction ops to vector.reduction and stop."),
+ Option<bool> enableRewriteMultiReductionToReductions{
+ *this, "enable-rewrite-multi-reduction-to-reductions",
+ llvm::cl::desc("Partially lower multi-reduction ops to reduction ops if "
+ "the reduction dimension is distributed."),
llvm::cl::init(false)};
void getDependentDialects(::mlir::DialectRegistry ®istry) const override {
@@ -305,29 +305,14 @@ struct TestXeGPUSgToWiDistributeExperimental
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
- // If `enablePartialReductionLowering` is set, only focus on testing the
- // partial lowering of vector::MultiReductionOp.
- if (enablePartialReductionLowering) {
+ // If `enableRewriteMultiReductionToReductions` is set, only focus on
+ // testing the partial lowering of vector::MultiReductionOp.
+ if (enableRewriteMultiReductionToReductions) {
xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
ConversionTarget target(*ctx);
RewritePatternSet patterns(ctx);
- xegpu::populateXeGPUSgToWiRewriteMultiReductionToReductionPatterns(
- patterns);
- // Mark 2D to 1D vector::MultiDimReductionOp as illegal.
- target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
- [&](vector::MultiDimReductionOp op) {
- int64_t sourceRank = op.getSourceVectorType().getRank();
- VectorType resultType =
- dyn_cast<VectorType>(op.getResult().getType());
- if (!resultType)
- return true;
- int64_t resultRank = resultType.getRank();
- return sourceRank != 2 || resultRank != 1;
- });
- // vector::ReductionOp is legal.
- target.addDynamicallyLegalOp<vector::ReductionOp>(
- [&](vector::ReductionOp op) { return true; });
- target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+ xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(patterns,
+ target);
(void)applyPartialConversion(getOperation(), target, std::move(patterns));
return;
}
>From 8d8800a74e4c964b5c594e759f85ba1cc3f59e76 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 7 Feb 2026 01:33:42 +0000
Subject: [PATCH 05/17] save work
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 2 +-
.../XeGPU/subgroup-distribute-unit.mlir | 41 ++++++++++---------
2 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 5e2dac9bbb79a..d28fe51ab91b1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -3,7 +3,7 @@
// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
// RUN: mlir-opt --allow-unregistered-dialect \
-// RUN: --test-xegpu-sg-to-wi-distribute-experimental="enable-REWRITE-to-reductions" \
+// RUN: --test-xegpu-sg-to-wi-distribute-experimental="enable-rewrite-multi-reduction-to-reductions" \
// RUN: --split-input-file %s | FileCheck --check-prefix=CHECK-REWRITE %s
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 81f25cc85359f..95a89e2edc84a 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -288,13 +288,9 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %cst : vector<16xf32> into f32
-// CHECK-NEXT: %[[T4:.*]] = vector.insert %[[T3]], %cst_0 [0] : f32 into vector<2xf32>
+// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
// CHECK-NEXT: %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %cst : vector<16xf32> into f32
-// CHECK-NEXT: %[[T7:.*]] = vector.insert %[[T6]], %[[T4]] [1] : f32 into vector<2xf32>
-// CHECK-NEXT: gpu.yield %[[T7]]
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -356,20 +352,27 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>{{.*}}) {
-// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32>
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_def"()
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: : () -> vector<16x2xf32>
// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK: %[[T4:.*]] = vector.insert %[[T3]], %cst_0 [0] : f32 into vector<2xf32>
-// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] {{.*}} : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T7:.*]] = vector.reduction <add>, %[[T6]], %{{.*}} : vector<16xf32> into f32
-// CHECK: %[[T8:.*]] = vector.insert %[[T7]], %[[T4]] [1] : f32 into vector<2xf32>
-// CHECK: gpu.yield %[[T8]]
-// CHECK: }
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]]
+// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
+// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]]
+// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
>From 7e43badadf2d46101be07e45428612e43dee0be5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 7 Feb 2026 01:36:22 +0000
Subject: [PATCH 06/17] save work
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 7 -------
1 file changed, 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 78f56cda281f0..faafb7e8cee61 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -840,9 +840,6 @@ void LayoutInfoPropagation::visitDpasOp(
// Step 1. Get all valid layouts for A, B, and C operands.
// All operands must have at least one valid subgroup layout.
LayoutInfo layoutD = results[0]->getValue();
- llvm::errs() << "DPAS result layout: ";
- layoutD.print(llvm::errs());
- llvm::errs() << "\n";
SmallVector<int> sgLayoutD = layoutD.getSgLayout();
assert(!sgLayoutD.empty() && "Expected layout for DPAS result.");
auto layoutDVal = std::make_pair(sgLayoutD[0], sgLayoutD[1]);
@@ -997,7 +994,6 @@ void LayoutInfoPropagation::visitStoreNdOp(
"Unable to determine the number of subgroups for the operation.");
return;
}
- llvm::errs() << "Num SG: " << numSgOrErr.value() << "\n";
auto sgLayouts = getValidLayouts(store.getValueType().getShape(),
instData, numSgOrErr.value());
if (sgLayouts.empty()) {
@@ -1015,9 +1011,6 @@ void LayoutInfoPropagation::visitStoreNdOp(
DenseI32ArrayAttr::get(dataTy.getContext(), sgData),
/*inst_data =*/nullptr, /*lane_layout =*/nullptr,
/*lane_data =*/nullptr, /*order =*/nullptr));
- llvm::errs() << "Chosen SG Layout:";
- storeLayout.print(llvm::errs());
- llvm::errs() << "\n";
}
store.setLayoutAttr(
dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
>From 3e53afc2e50999209132dff019e2a1ba616aac3b Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 7 Feb 2026 01:37:59 +0000
Subject: [PATCH 07/17] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 89 -------------------
1 file changed, 89 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index f116b20eea4aa..6af3eb6eb4ef7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -23,7 +23,6 @@
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
-#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/LogicalResult.h"
@@ -420,94 +419,6 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
}
};
-// class SgToWiRewriteVectorMultiReduction
-// : public OpConversionPattern<vector::MultiDimReductionOp> {
-// using
-// OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
-
-// LogicalResult
-// matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
-// ConversionPatternRewriter &rewriter) const override {
-// auto layout = xegpu::getDistributeLayoutAttr(op.getSource());
-
-// // If no layout, nothing to do.
-// if (!layout || !layout.isForSubgroup())
-// return failure();
-
-// VectorType sourceType = op.getSourceVectorType();
-// // Only 2D vectors are supported.
-// if (sourceType.getRank() != 2)
-// return rewriter.notifyMatchFailure(op,
-// "Only 2D reductions are
-// supported.");
-
-// ArrayRef<int64_t> reductionDims = op.getReductionDims();
-// // Only 1 reduction dimension supported.
-// if (reductionDims.size() != 1)
-// return rewriter.notifyMatchFailure(
-// op, "Only 1 reduction dimension is supported.");
-
-// int64_t reductionDim = reductionDims[0];
-
-// // Get the distributed source type based on layout.
-// FailureOr<VectorType> sourceDistTypeOrFailure =
-// getDistVecTypeBasedOnLaneLayout(layout, sourceType);
-// if (failed(sourceDistTypeOrFailure))
-// return rewriter.notifyMatchFailure(
-// op, "Failed to distribute the source vector type.");
-
-// VectorType sourceDistType = sourceDistTypeOrFailure.value();
-
-// // Determine which dimension is distributed.
-// bool dim0Distributed =
-// sourceDistType.getShape()[0] != sourceType.getShape()[0];
-// bool dim1Distributed =
-// sourceDistType.getShape()[1] != sourceType.getShape()[1];
-
-// if (dim0Distributed && dim1Distributed)
-// return rewriter.notifyMatchFailure(
-// op, "Expecting source to be distributed in a single dimension.");
-
-// int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
-// if (sourceDistDim == -1)
-// return rewriter.notifyMatchFailure(
-// op, "Expecting a distributed source vector.");
-
-// // Check if reduction is lane-local or requires cross-lane shuffling.
-// // Lane-local: reduction dimension is NOT the distributed dimension.
-// // Cross-lane: reduction dimension IS the distributed dimension.
-// bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
-// (sourceDistDim == 1 && reductionDim == 0);
-
-// // Get the distributed source and accumulator.
-// Value distributedSource = adaptor.getSource();
-// Value distributedAcc = adaptor.getAcc();
-
-// if (isReductionLaneLocal) {
-// // For lane-local reduction, each lane reduces its local data.
-// // Use lowerToVectorReductions to create a sequence of 1D reductions.
-// Value result = xegpu::lowerToVectorReductions(
-// cast<TypedValue<VectorType>>(distributedSource),
-// cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
-// reductionDim, op.getLoc(), rewriter);
-
-// rewriter.replaceOp(op, result);
-// return success();
-// }
-
-// // For non-lane-local (cross-lane) reduction, we also lower to a sequence
-// // of 1D reductions. The individual 1D reductions will be handled by
-// // SgToWiVectorReduction pattern which performs the cross-lane shuffles.
-// Value result = xegpu::lowerToVectorReductions(
-// cast<TypedValue<VectorType>>(distributedSource),
-// cast<TypedValue<VectorType>>(distributedAcc), op.getKind(),
-// reductionDim, op.getLoc(), rewriter);
-
-// rewriter.replaceOp(op, result);
-// return success();
-// }
-// };
-
struct LowerVectorMultiReductionPattern
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
>From 80561e563e5da373aee71c2a3aa75d2c4a71664e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 9 Feb 2026 23:48:46 +0000
Subject: [PATCH 08/17] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 81 +++++++++++++++++--
.../XeGPU/sg-to-wi-experimental-unit.mlir | 42 ++++++++++
2 files changed, 118 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 6af3eb6eb4ef7..7c641f788e842 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -419,6 +419,55 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
}
};
+struct SgToWiMultiDimReduction
+ : public OpConversionPattern<vector::MultiDimReductionOp> {
+ using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ // If no layout, nothing to do.
+ if (!resLayout || !resLayout.isForSubgroup())
+ return failure();
+ VectorType sourceType = op.getSourceVectorType();
+ // Only 2D vectors are supported.
+ if (sourceType.getRank() != 2)
+ return rewriter.notifyMatchFailure(
+ op, "Expecting 2D source vector in vector::MultiDimReductionOp at "
+ "subgroup level.");
+ VectorType resVecTy = dyn_cast<VectorType>(op.getType());
+ if (!resVecTy)
+ return rewriter.notifyMatchFailure(op, "Expecting vector result type");
+ // Compute the distributed vector type based on the layout.
+ FailureOr<VectorType> resDistVecTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
+ if (failed(resDistVecTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to compute the distributed vector type based on the "
+ "layout.");
+ // Check if the reduction is single dim.
+ ArrayRef<int64_t> reductionDims = op.getReductionDims();
+ if (reductionDims.size() != 1)
+ return rewriter.notifyMatchFailure(
+ op, "Only 1 reduction dimension is supported.");
+ // Check if the reduction is lane-local. If not, distributed result type ==
+ // original result type. This case is handled by
+ // `LowerVectorMultiReductionPattern`.
+ if (resVecTy == resDistVecTyOrFailure.value())
+ return rewriter.notifyMatchFailure(
+ op, "Reduction is not lane-local, expected reduction dimension to be "
+ "not distributed.");
+ // Simply create a new MultiDimReductionOp using adaptor operands and the
+ // new result type.
+ auto newOp = vector::MultiDimReductionOp::create(
+ rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
+ adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
struct LowerVectorMultiReductionPattern
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
@@ -663,17 +712,42 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
}
return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
});
- // VectorReductionOp is legal only if its source has no distribute layout
+ // vector::ReductionOp is legal only if its source has no distribute layout
// attribute.
target.addDynamicallyLegalOp<vector::ReductionOp>(
[=](vector::ReductionOp op) -> bool {
auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
return !layout;
});
+ // vector::MultiDimReductionOp is legal only if its result has no distribute
+ // layout attribute.
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [=](vector::MultiDimReductionOp op) -> bool {
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ // If no layout, mark legal.
+ if (!resLayout || !resLayout.isForSubgroup())
+ return true;
+ VectorType resTy = dyn_cast<VectorType>(op.getType());
+ if (!resTy)
+ return true;
+ // Compute the distributed result vector type based on the layout.
+ FailureOr<VectorType> resDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
+ if (failed(resDistTypeOrFailure))
+ return true;
+
+ ArrayRef<int64_t> reductionDims = op.getReductionDims();
+ // Only 1 reduction dimension supported.
+ if (reductionDims.size() != 1)
+ return true;
+ // Op is legal if the reduction dimension is distributed.
+ return resTy == resDistTypeOrFailure.value();
+ });
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiVectorReduction>(typeConverter, patterns.getContext());
+ SgToWiVectorReduction, SgToWiMultiDimReduction>(
+ typeConverter, patterns.getContext());
}
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
@@ -700,9 +774,6 @@ void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
if (reductionDims.size() != 1)
return true;
// Op is legal if the reduction dimension is not distributed.
- // If the reduction dim is distributed, each lane does its own local
- // reduction and result is distributed. So result types before and after
- // distribution should not match.
return resTy != resDistTypeOrFailure.value();
});
// vector::ReductionOp is legal.
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index d28fe51ab91b1..077d92335c089 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -275,4 +275,46 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
[0] : vector<16x2xf32> to vector<2xf32>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<4x1xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST]], %[[CST_0]] [0] : vector<4x1xf32> to vector<1xf32>
+// CHECK: gpu.return
+gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %src = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<0.0> : vector<4x16xf32>
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+ dense<0.0> : vector<16xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
+ }
+ [0] : vector<4x16xf32> to vector<16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x12xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST]], %[[CST_0]] [1] : vector<1x12xf32> to vector<1xf32>
+// CHECK: gpu.return
+gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %src = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ dense<0.0> : vector<16x12xf32>
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<16xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<16x12xf32> to vector<16xf32>
+ gpu.return
+}
}
>From 67d7f6e0ead4d737d2ed4766a0cfe3b8743e79e9 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:10:20 +0000
Subject: [PATCH 09/17] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 84 ++++++++++---------
1 file changed, 44 insertions(+), 40 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 7c641f788e842..294e49ed543af 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -88,6 +88,39 @@ static LogicalResult verifyLayouts(Operation *root) {
return walkResult.wasInterrupted() ? failure() : success();
}
+/// A vector::MultiDimReductionOp at subgroup level is in the expected form if
+/// it has exactly 1 reduction dimension, it has a valid result layout
+/// attribute, and its result type can be distributed to lanes using the layout.
+static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ // If there is no subgroup-level layout, the op is not valid.
+ if (!resLayout || !resLayout.isForSubgroup())
+ return false;
+ VectorType resTy = dyn_cast<VectorType>(op.getType());
+ if (!resTy)
+ return false;
+ // Compute the distributed result vector type based on the layout.
+ FailureOr<VectorType> resDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
+ if (failed(resDistTypeOrFailure))
+ return false;
+ return op.getReductionDims().size() == 1;
+}
+
+/// A vector::MultiDimReductionOp is doing lane-local reduction if each workitem
+/// is doing its own local reduction. In this case the result layout ensures
+/// that result vector is distributed to lanes, i.e. the result vector type is
+/// different from the distributed result vector type.
+static bool isReductionLaneLocal(vector::MultiDimReductionOp op) {
+ // Must be valid MultiDimReductionOp.
+ assert(isValidSubgroupMultiReductionOp(op) && "Expecting a valid subgroup "
+ "MultiDimReductionOp");
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ VectorType resTy = dyn_cast<VectorType>(op.getType());
+ auto resDistTypeOrFailure = getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
+ return resTy != resDistTypeOrFailure.value();
+}
+
/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
/// op. This simply drops the layout attribute from the tensor descriptor type.
struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
@@ -719,29 +752,14 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
return !layout;
});
- // vector::MultiDimReductionOp is legal only if its result has no distribute
- // layout attribute.
+ // vector::MultiDimReductionOp op legality.
target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
[=](vector::MultiDimReductionOp op) -> bool {
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- // If no layout, mark legal.
- if (!resLayout || !resLayout.isForSubgroup())
- return true;
- VectorType resTy = dyn_cast<VectorType>(op.getType());
- if (!resTy)
- return true;
- // Compute the distributed result vector type based on the layout.
- FailureOr<VectorType> resDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
- if (failed(resDistTypeOrFailure))
+ // Check common conditions for subgroup multi reduction op.
+ if (!isValidSubgroupMultiReductionOp(op))
return true;
-
- ArrayRef<int64_t> reductionDims = op.getReductionDims();
- // Only 1 reduction dimension supported.
- if (reductionDims.size() != 1)
- return true;
- // Op is legal if the reduction dimension is distributed.
- return resTy == resDistTypeOrFailure.value();
+ // Lane local reductions are illegal at this point and must be lowered.
+ return !isReductionLaneLocal(op);
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
@@ -752,29 +770,15 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
RewritePatternSet &patterns, ConversionTarget &target) {
- // vector::MultiDimReductionOp is legal only if the reduction dimension is
- // not distributed.
+ // vector::MultiDimReductionOp legality.
target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
[&](vector::MultiDimReductionOp op) {
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- // If no layout, mark legal.
- if (!resLayout || !resLayout.isForSubgroup())
- return true;
- VectorType resTy = dyn_cast<VectorType>(op.getType());
- if (!resTy)
- return true;
- // Compute the distributed result vector type based on the layout.
- FailureOr<VectorType> resDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resTy);
- if (failed(resDistTypeOrFailure))
- return true;
-
- ArrayRef<int64_t> reductionDims = op.getReductionDims();
- // Only 1 reduction dimension supported.
- if (reductionDims.size() != 1)
+ // Check common conditions for subgroup multi reduction op.
+ if (!isValidSubgroupMultiReductionOp(op))
return true;
- // Op is legal if the reduction dimension is not distributed.
- return resTy != resDistTypeOrFailure.value();
+ // Lane local reductions are legal. We only rewrite non-lane-local
+ // reductions.
+ return isReductionLaneLocal(op);
});
// vector::ReductionOp is legal.
target.addDynamicallyLegalOp<vector::ReductionOp>(
>From 113c8e19424ee8f0a8a924ebdd7697cfc273b726 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:15:38 +0000
Subject: [PATCH 10/17] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 41 ++++---------------
1 file changed, 9 insertions(+), 32 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 294e49ed543af..79c2179f8f035 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -508,43 +508,20 @@ struct LowerVectorMultiReductionPattern
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- // If no layout, nothing to do.
- if (!resLayout || !resLayout.isForSubgroup())
- return failure();
- VectorType sourceType = op.getSourceVectorType();
- // Only 2D vectors are supported.
- if (sourceType.getRank() != 2)
+ // Check validity for lowering at subgroup level.
+ if (!isValidSubgroupMultiReductionOp(op))
return rewriter.notifyMatchFailure(
- op, "Expecting 2D source vector in vector::MultiDimReductionOp at "
- "subgroup level.");
- VectorType resType = dyn_cast<VectorType>(op.getType());
- if (!resType)
- return rewriter.notifyMatchFailure(op, "Expecting vector result type");
- // Compute the distributed vector type based on the layout.
- FailureOr<VectorType> resDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resType);
- if (failed(resDistTypeOrFailure))
+ op, "Not a valid subgroup multi reduction op that can be lowered.");
+ // Only non-lane-local reduction is handled here.
+ if (isReductionLaneLocal(op))
return rewriter.notifyMatchFailure(
- op, "Failed to compute the distributed vector type based on the "
- "layout.");
-
+ op, "Reduction is lane-local, it does not require rewrite.");
ArrayRef<int64_t> reductionDims = op.getReductionDims();
- // Only 1 reduction dimension supported.
- if (reductionDims.size() != 1)
- return rewriter.notifyMatchFailure(
- op, "Only 1 reduction dimension is supported.");
-
+ assert(
+ reductionDims.size() == 1 &&
+ "Expecting single reduction dimension for subgroup multi reduction op");
int64_t reductionDim = reductionDims[0];
- // Rewrite is only needed when when the reduction is not lane-local.
- // If the reduction is lane-local, result type is not the same before and
- // after distribution (i.e. result is distributed to lanes not shared.)
- bool reductionDimDistributed =
- resType.getShape() != resDistTypeOrFailure.value().getShape();
- if (reductionDimDistributed)
- return failure();
-
// Rewrite MultiDimReductionOp into a sequence of ReductionOps.
Value result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(op.getSource()),
>From 14d0f1e3a378b199d85b1afdc57cff4abab83ae5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:25:34 +0000
Subject: [PATCH 11/17] save work
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 09fb1cef4e84e..5fdab1e759deb 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -664,7 +664,7 @@ Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
}
return laneVal;
-};
+}
Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
TypedValue<VectorType> acc,
>From 9b87ee2197803950548efffc3c979483ff50e62d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:35:23 +0000
Subject: [PATCH 12/17] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 41 ++++++-------------
1 file changed, 12 insertions(+), 29 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 79c2179f8f035..272fcba7b4e5e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -459,38 +459,21 @@ struct SgToWiMultiDimReduction
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- // If no layout, nothing to do.
- if (!resLayout || !resLayout.isForSubgroup())
- return failure();
- VectorType sourceType = op.getSourceVectorType();
- // Only 2D vectors are supported.
- if (sourceType.getRank() != 2)
- return rewriter.notifyMatchFailure(
- op, "Expecting 2D source vector in vector::MultiDimReductionOp at "
- "subgroup level.");
- VectorType resVecTy = dyn_cast<VectorType>(op.getType());
- if (!resVecTy)
- return rewriter.notifyMatchFailure(op, "Expecting vector result type");
- // Compute the distributed vector type based on the layout.
- FailureOr<VectorType> resDistVecTyOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
- if (failed(resDistVecTyOrFailure))
- return rewriter.notifyMatchFailure(
- op, "Failed to compute the distributed vector type based on the "
- "layout.");
- // Check if the reduction is single dim.
- ArrayRef<int64_t> reductionDims = op.getReductionDims();
- if (reductionDims.size() != 1)
+ // Check if the reduction op is valid for distribution.
+ if (!isValidSubgroupMultiReductionOp(op))
return rewriter.notifyMatchFailure(
- op, "Only 1 reduction dimension is supported.");
- // Check if the reduction is lane-local. If not, distributed result type ==
- // original result type. This case is handled by
- // `LowerVectorMultiReductionPattern`.
- if (resVecTy == resDistVecTyOrFailure.value())
+ op,
+ "Not a valid subgroup multi reduction op that can be distributed.");
+ // Only lane-local reduction is handled here.
+ if (!isReductionLaneLocal(op))
return rewriter.notifyMatchFailure(
- op, "Reduction is not lane-local, expected reduction dimension to be "
+ op, "Only lane-local reduction is supported, expected reduction "
+ "dimension to be "
"not distributed.");
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ VectorType resVecTy = dyn_cast<VectorType>(op.getType());
+ auto resDistVecTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
// Simply create a new MultiDimReductionOp using adaptor operands and the
// new result type.
auto newOp = vector::MultiDimReductionOp::create(
>From 6e05f11fd45064a0d728f86a829a50ac8757e8cd Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:41:11 +0000
Subject: [PATCH 13/17] add comments
---
.../Transforms/XeGPUSgToWiDistributeExperimental.cpp | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 272fcba7b4e5e..2edcaa182f7aa 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -452,6 +452,10 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
}
};
+/// This pattern distributes a subgroup-level vector.multi_reduction op to
+/// workitem-level only if the reduction is lane-local. This means that
+/// reduction dimension is not distributed to lanes and each lane does its own
+/// local reduction.
struct SgToWiMultiDimReduction
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
@@ -484,6 +488,12 @@ struct SgToWiMultiDimReduction
}
};
+/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
+/// of vector.extract_strided_slice, vector.reduction and
+/// vector.insert_strided_slice ops. This is used when the reduction dimension
+/// is distributed to lanes and a naive (lane-local) distribution is not
+/// possible. Then later on, these partilly lowered subgroup-level ops are
+/// further lowered to workitem-level by respective patterns.
struct LowerVectorMultiReductionPattern
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
>From 0d1f466b349e4de6b8fab002735ce6f04cb1e62e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 10 Feb 2026 00:44:58 +0000
Subject: [PATCH 14/17] fix test
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 38 +++++++++----------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 077d92335c089..1ec0879d4fb47 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -155,25 +155,25 @@ gpu.func @prefetch_nd() {
}
// CHECK-LABEL: gpu.func @vector_reduction
-// CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f32
-// CHECK: %[[LANE_RED:.*]] = vector.reduction <add>, %[[CAST:.*]] : vector<2xf32> into f32
-// CHECK: %[[C16_1:.*]] = arith.constant 16 : i32
-// CHECK: %[[C1:.*]] = arith.constant 1 : i32
-// CHECK: %[[SHUFFLE1:.*]], %{{.*}} = gpu.shuffle xor %[[LANE_RED]], %[[C1]], %[[C16_1]] : f32
-// CHECK: %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1]] : f32
-// CHECK: %[[C16_2:.*]] = arith.constant 16 : i32
-// CHECK: %[[C2:.*]] = arith.constant 2 : i32
-// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD1]], %[[C2]], %[[C16_2]] : f32
-// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2]] : f32
-// CHECK: %[[C16_3:.*]] = arith.constant 16 : i32
-// CHECK: %[[C4:.*]] = arith.constant 4 : i32
-// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %[[C4]], %[[C16_3]] : f32
-// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
-// CHECK: %[[C16_4:.*]] = arith.constant 16 : i32
-// CHECK: %[[C8:.*]] = arith.constant 8 : i32
-// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %[[C8]], %[[C16_4]] : f32
-// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
-// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
+// CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: %[[LANE_RED:.*]] = vector.reduction <add>, %[[CAST:.*]] : vector<2xf32> into f32
+// CHECK-DAG: %[[C16_1:.*]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE1:.*]], %{{.*}} = gpu.shuffle xor %[[LANE_RED]], %[[C1]], %[[C16_1]] : f32
+// CHECK: %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1]] : f32
+// CHECK-DAG: %[[C16_2:.*]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD1]], %[[C2]], %[[C16_2]] : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2]] : f32
+// CHECK-DAG: %[[C16_3:.*]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %[[C4]], %[[C16_3]] : f32
+// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK-DAG: %[[C16_4:.*]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %[[C8]], %[[C16_4]] : f32
+// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
gpu.func @vector_reduction() {
%acc = arith.constant 1.0 : f32
%0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : () -> vector<32xf32>
>From 86d705e00ac1f17eac8fb40aed8df11cc6b2b8a0 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 11 Feb 2026 22:46:51 +0000
Subject: [PATCH 15/17] fix test
---
.../mlir/Dialect/XeGPU/Transforms/Transforms.h | 5 ++++-
.../XeGPUSgToWiDistributeExperimental.cpp | 16 +++-------------
2 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 695f65b71bac6..6f6d58d4ab605 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -84,7 +84,10 @@ void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
ConversionTarget &target);
/// Appends patterns to rewrite vector::MultiDimReductionOp in terms of
/// vector::ReductionOps if the multi-reduction involves cross-lane data
-/// movement.
+/// movement. This pattern is used as a pre-processing step before applying
+/// subgroup to workitem distribution patterns. This pattern will rewrite a
+/// multi reduction in terms of a series of simpler extract, reduction and
+/// insert ops if the reduction requires cross-lane data movement.
void populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
RewritePatternSet &patterns, ConversionTarget &target);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 2edcaa182f7aa..b62b63ccad3ce 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -463,11 +463,6 @@ struct SgToWiMultiDimReduction
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- // Check if the reduction op is valid for distribution.
- if (!isValidSubgroupMultiReductionOp(op))
- return rewriter.notifyMatchFailure(
- op,
- "Not a valid subgroup multi reduction op that can be distributed.");
// Only lane-local reduction is handled here.
if (!isReductionLaneLocal(op))
return rewriter.notifyMatchFailure(
@@ -492,7 +487,7 @@ struct SgToWiMultiDimReduction
/// of vector.extract_strided_slice, vector.reduction and
/// vector.insert_strided_slice ops. This is used when the reduction dimension
/// is distributed to lanes and a naive (lane-local) distribution is not
-/// possible. Then later on, these partilly lowered subgroup-level ops are
+/// possible. Then later on, these partially lowered subgroup-level ops are
/// further lowered to workitem-level by respective patterns.
struct LowerVectorMultiReductionPattern
: public OpConversionPattern<vector::MultiDimReductionOp> {
@@ -501,10 +496,6 @@ struct LowerVectorMultiReductionPattern
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- // Check validity for lowering at subgroup level.
- if (!isValidSubgroupMultiReductionOp(op))
- return rewriter.notifyMatchFailure(
- op, "Not a valid subgroup multi reduction op that can be lowered.");
// Only non-lane-local reduction is handled here.
if (isReductionLaneLocal(op))
return rewriter.notifyMatchFailure(
@@ -513,13 +504,12 @@ struct LowerVectorMultiReductionPattern
assert(
reductionDims.size() == 1 &&
"Expecting single reduction dimension for subgroup multi reduction op");
- int64_t reductionDim = reductionDims[0];
// Rewrite MultiDimReductionOp into a sequence of ReductionOps.
Value result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(op.getSource()),
- cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(), reductionDim,
- op.getLoc(), rewriter);
+ cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
+ reductionDims[0], op.getLoc(), rewriter);
rewriter.replaceOp(op, result);
return success();
>From aa93972b4599553d9e91c8ecb8dd8305d9246d3f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 11 Feb 2026 22:48:51 +0000
Subject: [PATCH 16/17] fix test
---
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b62b63ccad3ce..d07c1350e30e0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -93,7 +93,7 @@ static LogicalResult verifyLayouts(Operation *root) {
/// result type can be distributed to lanes using the layout.
static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- // If no layout, mark legal.
+ // Without a subgroup-level layout, the op is not valid.
if (!resLayout || !resLayout.isForSubgroup())
return false;
VectorType resTy = dyn_cast<VectorType>(op.getType());
>From cc7635e5b30d23cd70438da0bad4e7566ee792c9 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 13 Feb 2026 19:37:54 +0000
Subject: [PATCH 17/17] fix test
---
.../XeGPUSgToWiDistributeExperimental.cpp | 20 ++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index d07c1350e30e0..3787fbb44e1b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -411,24 +411,30 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
if (!layout || !layout.isForSubgroup())
return failure();
- VectorType vectorType = op.getSourceVectorType();
-
+ VectorType srcVecType = op.getSourceVectorType();
// Only rank 1 vectors supported.
- if (vectorType.getRank() != 1)
+ if (srcVecType.getRank() != 1)
return rewriter.notifyMatchFailure(
op, "Only rank 1 reductions can be distributed.");
// Lane layout must have the same rank as the vector.
- if (layout.getRank() != vectorType.getRank())
+ if (layout.getRank() != srcVecType.getRank())
return rewriter.notifyMatchFailure(
op, "Layout rank does not match vector rank.");
// Get the subgroup size from the layout.
int64_t sgSize = layout.getEffectiveLaneLayoutAsInt()[0];
+ const auto *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+ op, "xegpu::ReductionOp require target attribute attached to "
+ "determine subgroup size");
// Only subgroup-sized vectors supported.
- if (vectorType.getShape()[0] % sgSize != 0)
- return rewriter.notifyMatchFailure(
- op, "Reduction vector dimension must match subgroup size.");
+ if (sgSize != uArch->getSubgroupSize() ||
+ srcVecType.getShape()[0] % sgSize != 0)
+ return rewriter.notifyMatchFailure(op,
+ "Invalid layout or reduction vector "
+ "dimension must match subgroup size.");
if (!op.getType().isIntOrFloat())
return rewriter.notifyMatchFailure(
More information about the Mlir-commits
mailing list