[Mlir-commits] [mlir] [MLIR][XeGPU] Enable one-step subgroup distribution of cross-lane reduction to shuffle op (PR #182698)
Jianhui Li
llvmlistbot at llvm.org
Tue Feb 24 14:51:00 PST 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/182698
>From 73a49452cbb9172d05a26845f540e78a28a803ff Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Feb 2026 18:39:47 +0000
Subject: [PATCH 01/14] change subgroup distribution of reduction cross lane to
use shuffle directly
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 7 +
.../Transforms/XeGPUSubgroupDistribute.cpp | 77 ++++++++---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 122 ++++++++++++++++--
3 files changed, 176 insertions(+), 30 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index ebf50c4cd57de..91b7c2202e56b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,6 +147,13 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::CombiningKind kind, int64_t reductionDim,
Location loc, PatternRewriter &rewriter);
+Value lowerToVectorReductionsCrossLane(TypedValue<VectorType> src,
+ TypedValue<VectorType> acc,
+ vector::CombiningKind kind,
+ int64_t reductionDim,
+ int64_t reductionSize, Location loc,
+ PatternRewriter &rewriter);
+
/// Helper Function to find a proper instruction multiple for the user-supplied
/// sg-level data shape (diven by `dim`). `candidates` are uArch allowed shapes.
/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 99c2da386fab6..d82766f61338a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1337,6 +1337,21 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
+ // Capture the size of the reduction dimension before computing the distributed
+ // source type; the unit-dimension rewrite below is currently commented out.
+ SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
+ sourceType.getShape().end());
+ int64_t reductionDimSize = sourceShape[reductionDim];
+ // sourceShape[reductionDim] = 1;
+ // VectorType modifiedSourceType =
+ // VectorType::get(sourceShape, sourceType.getElementType());
+
+ // print out modifiedSourceType and sourceLayout for debugging
+ // LLVM_DEBUG({
+ // llvm::dbgs() << "modifiedSourceType: " << modifiedSourceType << "\n";
+ // llvm::dbgs() << "sourceLayout: " << sourceLayout << "\n";
+ // });
+
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
if (failed(sourceDistTypeOrFailure))
@@ -1372,6 +1387,17 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
(sourceDistDim == 1 && reductionDim == 0);
+ // Print the distribution and reduction parameters for debugging.
+ LLVM_DEBUG({
+ llvm::dbgs() << "sourceDistDim: " << sourceDistDim << "\n";
+ llvm::dbgs() << "reductionDim: " << reductionDim << "\n";
+ llvm::dbgs() << "isReductionLaneLocal: " << isReductionLaneLocal << "\n";
+ llvm::dbgs() << "resultDistributed: " << resultDistributed << "\n";
+ llvm::dbgs() << "sourceDistType: " << sourceDistType << "\n";
+ llvm::dbgs() << "distributedResultType: " << distributedResultType
+ << "\n";
+ });
+
if (isReductionLaneLocal && !resultDistributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed result for lane-local reduction.");
@@ -1381,33 +1407,46 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
warpOp,
"Expecting a broadcasted result for non-lane-local reduction.");
+ // Yield the source and acc vectors from the WarpOp.
+ SmallVector<size_t> newRetIndices;
+ auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
+ {sourceDistType, distributedResultType}, newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+
+ Value result;
// Handle lane-local reduction case. In this case we fully distribute the
// reduction result.
if (isReductionLaneLocal) {
- // Yield the source and acc vectors from the WarpOp.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
- {sourceDistType, distributedResultType}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- Value result = xegpu::lowerToVectorReductions(
+
+ result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
- // Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
- return success();
+ // print the reduction op for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "reductionOp1: " << *reductionOp << "\n";
+ llvm::dbgs() << "lowered reduction result1: " << result << "\n";
+ });
+
+ } else {
+ // For the non-lane-local case, reduce the distributed source across lanes
+ // with xegpu::lowerToVectorReductionsCrossLane, applied directly to the
+ // source and accumulator values yielded by the new warp op.
+ // rewriter.setInsertionPointAfter(reductionOp);
+ result = xegpu::lowerToVectorReductionsCrossLane(
+ cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
+ cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
+ reductionOp.getKind(), reductionDim, reductionDimSize,
+ reductionOp.getLoc(), rewriter);
+ // print the reduction op for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "reductionOp2: " << *reductionOp << "\n";
+ llvm::dbgs() << "lowered reduction result2: " << result << "\n";
+ });
}
- // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
- // of multiple ReductionOps. Actual distribution is done by the
- // WarpOpReduction pattern.
- rewriter.setInsertionPointAfter(reductionOp);
- Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(reductionOp.getSource()),
- cast<TypedValue<VectorType>>(reductionOp.getAcc()),
- reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
return success();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 5fdab1e759deb..301b779c955d2 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -121,13 +121,36 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
// dimensions are not distributed.
unsigned distributionStart =
originalType.getRank() - effectiveLaneLayout.size();
+
+ // Print original shape and lane layout for debugging
+ std::string shapeStr = "[";
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i > 0)
+ shapeStr += ", ";
+ shapeStr += std::to_string(dim);
+ }
+ shapeStr += "]";
+ LDBG() << "original shape: " << shapeStr;
+
+ std::string layoutStr = "[";
+ for (auto [i, dim] : llvm::enumerate(effectiveLaneLayout)) {
+ if (i > 0)
+ layoutStr += ", ";
+ layoutStr += std::to_string(dim);
+ }
+ layoutStr += "]";
+ LDBG() << "effective lane layout: " << layoutStr;
+
for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
if (i < distributionStart)
continue;
// Check if the dimension can be distributed evenly.
- if (dim % effectiveLaneLayout[i - distributionStart] != 0)
- return failure();
- distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+ if (dim % effectiveLaneLayout[i - distributionStart] != 0) {
+ assert( effectiveLaneLayout[i - distributionStart] % dim == 0 &&
+ "The dimension size must be able evenly distributed to all lanes in round-robin manner.");
+ distributedShape[i] = 1;
+ } else
+ distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
}
return VectorType::get(distributedShape, originalType.getElementType());
}
@@ -682,10 +705,10 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
- auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
- auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
- // Reduction result should have the same layout as the accumulator.
- xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+ // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+ // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+ // // Reduction result should have the same layout as the accumulator.
+ // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -702,7 +725,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
// Extract strided slice has the same layout as src.
- xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
+ // xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
@@ -713,8 +736,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
// Shape cast output has the same layout as the accumulator. Shape cast
// source has the same layout as the original reduction source.
- xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
- xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
+ // xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+ // xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction = vector::ReductionOp::create(
@@ -722,7 +745,84 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
// Insert op should have the same layout as the accumulator.
- xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+ // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+ }
+ return reductionResult;
+}
+
+Value xegpu::lowerToVectorReductionsCrossLane(
+ TypedValue<VectorType> src, TypedValue<VectorType> acc,
+ vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
+ Location loc, PatternRewriter &rewriter) {
+ // Expecting a 2D source vector.
+ assert(src.getType().getRank() == 2 && "expected a 2D source vector");
+ VectorType sourceType = src.getType();
+ int64_t sourceH = sourceType.getShape()[0];
+ int64_t sourceW = sourceType.getShape()[1];
+ int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+ // Create a constant vector to hold the result of the reduction.
+ TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
+ Value reductionResult = arith::ConstantOp::create(
+ rewriter, loc, acc.getType(),
+ DenseElementsAttr::get(acc.getType(), zeroAttr));
+ // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+ // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+ // // Reduction result should have the same layout as the accumulator.
+ // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+
+ // print source shape, reduction dim and reduction size for debugging
+ std::string shapeStr = "[";
+ for (auto [i, dim] : llvm::enumerate(sourceType.getShape())) {
+ if (i > 0)
+ shapeStr += ", ";
+ shapeStr += std::to_string(dim);
+ }
+ shapeStr += "]";
+ LDBG() << "source shape: " << shapeStr;
+ LDBG() << "reduction dim: " << reductionDim;
+ LDBG() << "reduction size: " << reductionSize;
+
+ // For each slice of the source, extract the slice vector, do a reduction,
+ // and insert the reduced value back into the result vector.
+ for (int i = 0; i < nSlices; ++i) {
+ SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
+ if (reductionDim == 1) {
+ sliceOffsets = {i, 0};
+ sliceSizes = {1, sourceW};
+ } else {
+ sliceOffsets = {0, i};
+ sliceSizes = {sourceH, 1};
+ }
+
+ // print src, sliceOffsets, sliceSizes for debugging
+ LDBG() << "src: " << src;
+ LDBG() << "sliceOffsets: [" << sliceOffsets[0] << ", " << sliceOffsets[1]
+ << "]";
+ LDBG() << "sliceSizes: [" << sliceSizes[0] << ", " << sliceSizes[1] << "]";
+
+ vector::ExtractStridedSliceOp extractOp =
+ vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
+ sliceSizes, {1, 1});
+
+ int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
+
+ vector::ShapeCastOp slice = vector::ShapeCastOp::create(
+ rewriter, loc,
+ VectorType::get({nSliceElements}, sourceType.getElementType()),
+ extractOp.getResult());
+
+ // Extract and reduction results in scalars, so no result layout is needed.
+ Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
+
+ // Distribute and reduce across work-items in the subgroup.
+ Value fullReduce =
+ xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
+
+ fullReduce =
+ vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
+
+ reductionResult =
+ vector::InsertOp::create(rewriter, loc, fullReduce, reductionResult, i);
}
return reductionResult;
}
>From 0b5be3187723bed4ea2513851459d6ee86d55620 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 19 Feb 2026 23:57:56 +0000
Subject: [PATCH 02/14] add temporary layout back
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 301b779c955d2..e0a1de1af45aa 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -705,10 +705,10 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
- // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
- // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
- // // Reduction result should have the same layout as the accumulator.
- // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+ auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+ auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+ // Reduction result should have the same layout as the accumulator.
+ xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -725,7 +725,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
// Extract strided slice has the same layout as src.
- // xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
+ xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
@@ -736,8 +736,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
// Shape cast output has the same layout as the accumulator. Shape cast
// source has the same layout as the original reduction source.
- // xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
- // xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
+ xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+ xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction = vector::ReductionOp::create(
@@ -745,7 +745,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
reductionResult =
vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
// Insert op should have the same layout as the accumulator.
- // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
+ xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
}
return reductionResult;
}
>From d22fdd2cfec585c6628aa318a80c8f4d8b19d402 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 01:31:32 +0000
Subject: [PATCH 03/14] modifying tests
---
.../XeGPU/subgroup-distribute-unit.mlir | 74 ++++++++++++-------
1 file changed, 46 insertions(+), 28 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index fb23f38b44b46..60fe29b7c9338 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -283,14 +283,30 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
gpu.return
}
-
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
-// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT: %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK: %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK: %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[T2]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
+// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
+// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[T1]] : f32
+// CHECK: %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK: %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
+// CHECK: %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle xor %[[T9]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
+// CHECK: %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
+// CHECK: %[[SHUFFLE7:.*]], %{{.*}} = gpu.shuffle xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUFFLE7]] : f32
+// CHECK: %[[SHUFFLE8:.*]], %{{.*}} = gpu.shuffle xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUFFLE8]] : f32
+// CHECK: %[[FINAL2:.*]] = arith.addf %[[ADD8]], %[[T8]] : f32
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -352,27 +368,29 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"()
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-// CHECK-SAME: : () -> vector<16x2xf32>
-// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]]
-// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
-// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]]
-// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
+// CHECK: %[[V0:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK: %[[V1:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<1x2xf32>
+// CHECK: %[[SHUF1:.*]], %[[VALID1:.*]] = gpu.shuffle xor %[[V1]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD1:.*]] = arith.addf %[[V1]], %[[SHUF1]] : f32
+// CHECK: %[[SHUF2:.*]], %[[VALID2:.*]] = gpu.shuffle xor %[[ADD1]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUF2]] : f32
+// CHECK: %[[SHUF3:.*]], %[[VALID3:.*]] = gpu.shuffle xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUF3]] : f32
+// CHECK: %[[SHUF4:.*]], %[[VALID4:.*]] = gpu.shuffle xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUF4]] : f32
+// CHECK: %[[RES0:.*]] = arith.addf %[[ADD4]], %[[V0]] : f32
+// CHECK: %[[V2:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK: %[[V3:.*]] = vector.extract %{{.*}}#1[0, 1] : f32 from vector<1x2xf32>
+// CHECK: %[[SHUF5:.*]], %[[VALID5:.*]] = gpu.shuffle xor %[[V3]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD5:.*]] = arith.addf %[[V3]], %[[SHUF5]] : f32
+// CHECK: %[[SHUF6:.*]], %[[VALID6:.*]] = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUF6]] : f32
+// CHECK: %[[SHUF7:.*]], %[[VALID7:.*]] = gpu.shuffle xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUF7]] : f32
+// CHECK: %[[SHUF8:.*]], %[[VALID8:.*]] = gpu.shuffle xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUF8]] : f32
+// CHECK: %[[RES1:.*]] = arith.addf %[[ADD8]], %[[V2]] : f32
+// CHECK: %[[RESULT:.*]] = vector.from_elements %[[RES0]], %[[RES1]] : vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
>From 89ca00db91d280fa396da7c37b42601517e8b975 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 02:50:36 +0000
Subject: [PATCH 04/14] remove empty lines
---
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir | 2 --
1 file changed, 2 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 60fe29b7c9338..f8d71350b7c52 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,8 +329,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
-
-
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
>From ad00de29fd88e8df3952c5da7936963eb4869af5 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 20 Feb 2026 03:27:08 +0000
Subject: [PATCH 05/14] adding tests
---
.../XeGPU/subgroup-distribute-unit.mlir | 38 +++++++++++++++++++
1 file changed, 38 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f8d71350b7c52..1cea44af57459 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,6 +329,44 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK: %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
+// CHECK: %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[T2]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
+// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
+// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD2]], %[[T1]] : f32
+// CHECK: %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
+// CHECK: %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
+// CHECK: %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle xor %[[T9]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
+// CHECK: %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
+// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
+// CHECK: %[[FINAL2:.*]] = arith.addf %[[ADD6]], %[[T8]] : f32
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %src = "some_def"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<16x4xf32>)
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<16x4xf32> to vector<2xf32>
+ gpu.yield %1 : vector<2xf32>
+ }
+ "some_user_op"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
>From a25ff3504aa4b3affb90f7ecdf923ee81901cd5a Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 04:50:08 +0000
Subject: [PATCH 06/14] adding support to new subgroup distribution mechanism
---
.../XeGPUSgToWiDistributeExperimental.cpp | 88 ++++++---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 174 ++++++++++--------
.../XeGPU/subgroup-distribute-unit.mlir | 38 ----
3 files changed, 168 insertions(+), 132 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..4237507c7f016 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -469,22 +469,48 @@ struct SgToWiMultiDimReduction
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ Value result;
// Only lane-local reduction is handled here.
- if (!isReductionLaneLocal(op))
- return rewriter.notifyMatchFailure(
- op, "Only lane-local reduction is supported, expected reduction "
- "dimension to be "
- "not distributed.");
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- VectorType resVecTy = dyn_cast<VectorType>(op.getType());
- auto resDistVecTyOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
- // Simply create a new MultiDimReductionOp using adaptor operands and the
- // new result type.
- auto newOp = vector::MultiDimReductionOp::create(
- rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
- adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
- rewriter.replaceOp(op, newOp.getResult());
+ if (isReductionLaneLocal(op)) {
+ auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+ VectorType resVecTy = dyn_cast<VectorType>(op.getType());
+ auto resDistVecTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
+ // Simply create a new MultiDimReductionOp using adaptor operands and the
+ // new result type.
+ result = vector::MultiDimReductionOp::create(
+ rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
+ adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
+ } else {
+ ArrayRef<int64_t> reductionDims = op.getReductionDims();
+ assert(reductionDims.size() == 1 &&
+ "Expecting single reduction dimension for subgroup multi "
+ "reduction op");
+ // print adaptor.getSource() and adaptor.getAcc() for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
+ llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
+ });
+ // Read the size of the reduction dimension from the source vector type
+ // of the original op; it is passed to the cross-lane lowering as the
+ // reduction size.
+ auto reductionDim = reductionDims[0];
+ VectorType sourceType = op.getSourceVectorType();
+ SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
+ sourceType.getShape().end());
+ int64_t reductionDimSize = sourceShape[reductionDim];
+ result = xegpu::lowerToVectorReductionsCrossLane(
+ cast<TypedValue<VectorType>>(adaptor.getSource()),
+ cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+ reductionDim, reductionDimSize, op.getLoc(), rewriter);
+ // print the reduction op for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "reductionOp3: " << *op << "\n";
+ llvm::dbgs() << "lowered reduction result3: " << result << "\n";
+ });
+ }
+
+ rewriter.replaceOp(op, result);
return success();
}
};
@@ -511,11 +537,31 @@ struct LowerVectorMultiReductionPattern
reductionDims.size() == 1 &&
"Expecting single reduction dimension for subgroup multi reduction op");
- // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
- Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(op.getSource()),
- cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
- reductionDims[0], op.getLoc(), rewriter);
+ // // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
+ // Value result = xegpu::lowerToVectorReductions(
+ // cast<TypedValue<VectorType>>(op.getSource()),
+ // cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
+ // reductionDims[0], op.getLoc(), rewriter);
+
+ // For non-lane-local case, we simply rewrite the MultiReductionOp in
+ // terms of multiple ReductionOps. Actual distribution is done by the
+ // WarpOpReduction pattern.
+ // rewriter.setInsertionPointAfter(reductionOp);
+
+ // print adaptor.getSource() and adaptor.getAcc() for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
+ llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
+ });
+ Value result = xegpu::lowerToVectorReductionsCrossLane(
+ cast<TypedValue<VectorType>>(adaptor.getSource()),
+ cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+ reductionDims[0], /*reductionDimSize=*/1, op.getLoc(), rewriter);
+ // print the reduction op for debugging
+ LLVM_DEBUG({
+ llvm::dbgs() << "reductionOp3: " << *op << "\n";
+ llvm::dbgs() << "lowered reduction result3: " << result << "\n";
+ });
rewriter.replaceOp(op, result);
return success();
@@ -725,7 +771,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
if (!isValidSubgroupMultiReductionOp(op))
return true;
// Lane local reductions are illegal at this point and must be lowered.
- return !isReductionLaneLocal(op);
+ return false; // !isReductionLaneLocal(op);
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..abf3ddcc3d373 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,12 +2,6 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
-// RUN: mlir-opt --allow-unregistered-dialect \
-// RUN: --test-xegpu-sg-to-wi-distribute-experimental="enable-rewrite-multi-reduction-to-reductions" \
-// RUN: --split-input-file %s | FileCheck --check-prefix=CHECK-REWRITE %s
-
-
-
gpu.module @xevm_module {
// CHECK-LABEL: gpu.func @create_nd_tdesc
// CHECK: %[[C0:.*]] = arith.constant 0 : index
@@ -181,43 +175,56 @@ gpu.func @vector_reduction() {
gpu.return
}
-
-// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK-REWRITE-DAG: %[[SRC:.*]] = "some_def"() {layout_result_0 =
-// CHECK-REWRITE-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<2x16xf32>
-// CHECK-REWRITE-DAG: %[[ACC:.*]] = arith.constant
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE-DAG: %[[ZERO:.*]] = arith.constant
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE: %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME: offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
-// CHECK-REWRITE-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
-// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: : vector<1x16xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT: %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT: %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME: offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
-// CHECK-REWRITE-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
-// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: : vector<1x16xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT: %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT: %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
+// CHECK: %c16_i32 = arith.constant 16 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c16_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c16_i32_2 = arith.constant 16 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c16_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %c16_i32_5 = arith.constant 16 : i32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle xor %5, %c4_i32, %c16_i32_5 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
+// CHECK: %c16_i32_8 = arith.constant 16 : i32
+// CHECK: %c8_i32 = arith.constant 8 : i32
+// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle xor %6, %c8_i32, %c16_i32_8 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
+// CHECK: %8 = arith.addf %7, %2 : f32
+// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
+// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
+// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
+// CHECK: %c16_i32_11 = arith.constant 16 : i32
+// CHECK: %c1_i32_12 = arith.constant 1 : i32
+// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle xor %13, %c1_i32_12, %c16_i32_11 : f32
+// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
+// CHECK: %c16_i32_15 = arith.constant 16 : i32
+// CHECK: %c2_i32_16 = arith.constant 2 : i32
+// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle xor %14, %c2_i32_16, %c16_i32_15 : f32
+// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
+// CHECK: %c16_i32_19 = arith.constant 16 : i32
+// CHECK: %c4_i32_20 = arith.constant 4 : i32
+// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle xor %15, %c4_i32_20, %c16_i32_19 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
+// CHECK: %c16_i32_23 = arith.constant 16 : i32
+// CHECK: %c8_i32_24 = arith.constant 8 : i32
+// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle xor %16, %c8_i32_24, %c16_i32_23 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
+// CHECK: %18 = arith.addf %17, %12 : f32
+// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = "some_def"()
+ %src = arith.constant
{layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<2x16xf32>)
+ dense<0.0> : vector<2x16xf32>
%acc = arith.constant
{layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
dense<0.0> : vector<2xf32>
@@ -229,42 +236,63 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
-// CHECK-REWRITE-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK-REWRITE-DAG: %[[SRC:.*]] = "some_def"() {layout_result_0 =
-// CHECK-REWRITE-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : () -> vector<16x2xf32>
-// CHECK-REWRITE-DAG: %[[ACC:.*]] = arith.constant
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE-DAG: %[[ZERO:.*]] = arith.constant
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME: dense<0.000000e+00> : vector<2xf32>
-// CHECK-REWRITE: %[[SLICE0:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REWRITE-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[SLICE0]]
-// CHECK-REWRITE-SAME: {{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT: %[[ACC0:.*]] = vector.extract %[[ACC]][0] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[RED0:.*]] = vector.reduction <add>, %[[CAST0]], %[[ACC0]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT: %[[INS0:.*]] = vector.insert %[[RED0]], %[[ZERO]] [0]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[SLICE1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-REWRITE-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK-REWRITE-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[SLICE1]]
-// CHECK-REWRITE-SAME: {{{.*}}, layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
-// CHECK-REWRITE-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK-REWRITE-NEXT: %[[ACC1:.*]] = vector.extract %[[ACC]][1] : f32 from vector<2xf32>
-// CHECK-REWRITE-NEXT: %[[RED1:.*]] = vector.reduction <add>, %[[CAST1]], %[[ACC1]] : vector<16xf32> into f32
-// CHECK-REWRITE-NEXT: %[[INS1:.*]] = vector.insert %[[RED1]], %[[INS0]] [1]
-// CHECK-REWRITE-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-REWRITE-SAME: : f32 into vector<2xf32>
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x1xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<1xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c4_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c4_i32_2 = arith.constant 4 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c4_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %6 = arith.addf %5, %2 : f32
+// CHECK: %7 = vector.insert %6, %cst_1 [0] : f32 into vector<1xf32>
+gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %src = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<0.0> : vector<1x4xf32>
+ %acc = arith.constant
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
+ dense<0.0> : vector<1xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
+ {
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+ }
+ [1] : vector<1x4xf32> to vector<1xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
+// CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
+// CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %c1_i32 = arith.constant 1 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle xor %5, %c1_i32, %c2_i32 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult : f32
+// CHECK: %7 = arith.addf %6, %4 : f32
+// CHECK: %8 = vector.insert %7, %cst_0 [0] : f32 into vector<2xf32>
+// CHECK: %9 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %10 = vector.shape_cast %9 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %11 = vector.extract %cst[1] : f32 from vector<2xf32>
+// CHECK: %12 = vector.reduction <add>, %10 : vector<1xf32> into f32
+// CHECK: %c2_i32_1 = arith.constant 2 : i32
+// CHECK: %c1_i32_2 = arith.constant 1 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %12, %c1_i32_2, %c2_i32_1 : f32
+// CHECK: %13 = arith.addf %12, %shuffleResult_3 : f32
+// CHECK: %14 = arith.addf %13, %11 : f32
+// CHECK: %15 = vector.insert %14, %8 [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%src = "some_def"()
{layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<16x2xf32>)
+ : () -> (vector<2x2xf32>)
%acc = arith.constant
{layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
dense<0.0> : vector<2xf32>
@@ -272,7 +300,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
{
layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
}
- [0] : vector<16x2xf32> to vector<2xf32>
+ [0] : vector<2x2xf32> to vector<2xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 1cea44af57459..f8d71350b7c52 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -329,44 +329,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK: %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK: %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
-// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[T2]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
-// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
-// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD2]], %[[T1]] : f32
-// CHECK: %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK: %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
-// CHECK: %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle xor %[[T9]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
-// CHECK: %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
-// CHECK: %[[FINAL2:.*]] = arith.addf %[[ADD6]], %[[T8]] : f32
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reductio_over_partial_sg_size(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<16x4xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
- }
- [1] : vector<16x4xf32> to vector<2xf32>
- gpu.yield %1 : vector<2xf32>
- }
- "some_user_op"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
>From 6b5ebccda034291b46260ac664dcf2095e872ebf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 05:05:49 +0000
Subject: [PATCH 07/14] remove XeGPUSgToWiLowerVectorMultiReduction pattern
---
.../Dialect/XeGPU/Transforms/Transforms.h | 8 ---
.../XeGPUSgToWiDistributeExperimental.cpp | 72 -------------------
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 18 -----
3 files changed, 98 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 6f6d58d4ab605..6afd9c9d09369 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -82,14 +82,6 @@ void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target);
-/// Appends patterns to rewrite vector::MultiDimReductionOp in terms of
-/// vector::ReductionOps if the multi-reduction involves cross-lane data
-/// movement. This pattern is used as pre-processing step before applying
-/// subgroup to workitem distribution patterns. This pattern will rewrite a
-/// multi reduction in terms of a series of simpler extract, reduction and
-/// insert ops if the reduction require cross-lane data movement.
-void populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
- RewritePatternSet &patterns, ConversionTarget &target);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 4237507c7f016..b73459b0587b1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -515,59 +515,6 @@ struct SgToWiMultiDimReduction
}
};
-/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
-/// of vector.extract_strided_slice, vector.reduction and
-/// vector.insert_strided_slice ops. This is used when the reduction dimension
-/// is distributed to lanes and a naive (lane-local) distribution is not
-/// possible. Then later on, these partially lowered subgroup-level ops are
-/// further lowered to workitem-level by respective patterns.
-struct LowerVectorMultiReductionPattern
- : public OpConversionPattern<vector::MultiDimReductionOp> {
- using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
-
- LogicalResult
- matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- // Only non-lane-local reduction is handled here.
- if (isReductionLaneLocal(op))
- return rewriter.notifyMatchFailure(
- op, "Reduction is lane-local, it does not require rewrite.");
- ArrayRef<int64_t> reductionDims = op.getReductionDims();
- assert(
- reductionDims.size() == 1 &&
- "Expecting single reduction dimension for subgroup multi reduction op");
-
- // // Rewrite MultiDimReductionOp into a sequence of ReductionOps.
- // Value result = xegpu::lowerToVectorReductions(
- // cast<TypedValue<VectorType>>(op.getSource()),
- // cast<TypedValue<VectorType>>(op.getAcc()), op.getKind(),
- // reductionDims[0], op.getLoc(), rewriter);
-
- // For non-lane-local case, we simply rewrite the MultiReductionOp in
- // terms of multiple ReductionOps. Actual distribution is done by the
- // WarpOpReduction pattern.
- // rewriter.setInsertionPointAfter(reductionOp);
-
- // print adaptor.getSource() and adaptor.getAcc() for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
- llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
- });
- Value result = xegpu::lowerToVectorReductionsCrossLane(
- cast<TypedValue<VectorType>>(adaptor.getSource()),
- cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
- reductionDims[0], /*reductionDimSize=*/1, op.getLoc(), rewriter);
- // print the reduction op for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "reductionOp3: " << *op << "\n";
- llvm::dbgs() << "lowered reduction result3: " << result << "\n";
- });
-
- rewriter.replaceOp(op, result);
- return success();
- }
-};
-
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -779,22 +726,3 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
SgToWiVectorReduction, SgToWiMultiDimReduction>(
typeConverter, patterns.getContext());
}
-
-void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
- RewritePatternSet &patterns, ConversionTarget &target) {
- // vector::MultiDimReductionOp legality.
- target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
- [&](vector::MultiDimReductionOp op) {
- // Check common conditions for subgroup multi reduction op.
- if (!isValidSubgroupMultiReductionOp(op))
- return true;
- // Lane local reductions are legal. We only rewrite non-lane-local
- // reductions.
- return isReductionLaneLocal(op);
- });
- // vector::ReductionOp is legal.
- target.addDynamicallyLegalOp<vector::ReductionOp>(
- [&](vector::ReductionOp op) { return true; });
- target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
- patterns.add<LowerVectorMultiReductionPattern>(patterns.getContext());
-}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 33af2c5b33d89..4192ac46764dd 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -273,12 +273,6 @@ struct TestXeGPUSgToWiDistributeExperimental
"Work-item Distribution";
}
- Option<bool> enableRewriteMultiReductionToReductions{
- *this, "enable-rewrite-multi-reduction-to-reductions",
- llvm::cl::desc("Partially lower multi-reduction ops to reduction ops if "
- "the reduction dimension is distributed."),
- llvm::cl::init(false)};
-
void getDependentDialects(::mlir::DialectRegistry ®istry) const override {
registry.insert<arith::ArithDialect>();
registry.insert<memref::MemRefDialect>();
@@ -306,18 +300,6 @@ struct TestXeGPUSgToWiDistributeExperimental
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
- // If `enableRewriteMultiReductionToReductions` is set, only focus on
- // testing the partial lowering of vector::MultiReductionOp.
- if (enableRewriteMultiReductionToReductions) {
- xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
- ConversionTarget target(*ctx);
- RewritePatternSet patterns(ctx);
- xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(patterns,
- target);
- (void)applyPartialConversion(getOperation(), target, std::move(patterns));
- return;
- }
-
ConversionTarget target(*ctx);
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
>From 1b9165e61b0fad8dc9a7d325a6cb1c133ac30a49 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 05:14:15 +0000
Subject: [PATCH 08/14] remove the multi-reduction pattern change in old warp
op based distribution mechanism
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 77 +++++--------------
.../XeGPU/subgroup-distribute-unit.mlir | 76 ++++++++----------
2 files changed, 49 insertions(+), 104 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d82766f61338a..99c2da386fab6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1337,21 +1337,6 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
- // before get distribute vec type for source, first set its shape to be unit
- // for the reduction dimension
- SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
- sourceType.getShape().end());
- int64_t reductionDimSize = sourceShape[reductionDim];
- // sourceShape[reductionDim] = 1;
- // VectorType modifiedSourceType =
- // VectorType::get(sourceShape, sourceType.getElementType());
-
- // print out modifiedSourceType and sourceLayout for debugging
- // LLVM_DEBUG({
- // llvm::dbgs() << "modifiedSourceType: " << modifiedSourceType << "\n";
- // llvm::dbgs() << "sourceLayout: " << sourceLayout << "\n";
- // });
-
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
if (failed(sourceDistTypeOrFailure))
@@ -1387,17 +1372,6 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
(sourceDistDim == 1 && reductionDim == 0);
- // print here all these five variables for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "sourceDistDim: " << sourceDistDim << "\n";
- llvm::dbgs() << "reductionDim: " << reductionDim << "\n";
- llvm::dbgs() << "isReductionLaneLocal: " << isReductionLaneLocal << "\n";
- llvm::dbgs() << "resultDistributed: " << resultDistributed << "\n";
- llvm::dbgs() << "sourceDistType: " << sourceDistType << "\n";
- llvm::dbgs() << "distributedResultType: " << distributedResultType
- << "\n";
- });
-
if (isReductionLaneLocal && !resultDistributed)
return rewriter.notifyMatchFailure(
warpOp, "Expecting a distributed result for lane-local reduction.");
@@ -1407,46 +1381,33 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
warpOp,
"Expecting a broadcasted result for non-lane-local reduction.");
- // Yield the source and acc vectors from the WarpOp.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
- {sourceDistType, distributedResultType}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
-
- Value result;
// Handle lane-local reduction case. In this case we fully distribute the
// reduction result.
if (isReductionLaneLocal) {
-
- result = xegpu::lowerToVectorReductions(
+ // Yield the source and acc vectors from the WarpOp.
+ SmallVector<size_t> newRetIndices;
+ auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
+ {sourceDistType, distributedResultType}, newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ Value result = xegpu::lowerToVectorReductions(
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
- // print the reduction op for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "reductionOp1: " << *reductionOp << "\n";
- llvm::dbgs() << "lowered reduction result1: " << result << "\n";
- });
-
- } else {
- // For non-lane-local case, we simply rewrite the MultiReductionOp in
- // terms of multiple ReductionOps. Actual distribution is done by the
- // WarpOpReduction pattern.
- // rewriter.setInsertionPointAfter(reductionOp);
- result = xegpu::lowerToVectorReductionsCrossLane(
- cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
- cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
- reductionOp.getKind(), reductionDim, reductionDimSize,
- reductionOp.getLoc(), rewriter);
- // print the reduction op for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "reductionOp2: " << *reductionOp << "\n";
- llvm::dbgs() << "lowered reduction result2: " << result << "\n";
- });
+ // Replace the warp op result with the final result.
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
+ return success();
}
+ // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
+ // of multiple ReductionOps. Actual distribution is done by the
+ // WarpOpReduction pattern.
+ rewriter.setInsertionPointAfter(reductionOp);
+ Value result = xegpu::lowerToVectorReductions(
+ cast<TypedValue<VectorType>>(reductionOp.getSource()),
+ cast<TypedValue<VectorType>>(reductionOp.getAcc()),
+ reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
// Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
+ rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
return success();
}
};
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f8d71350b7c52..fb23f38b44b46 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -283,30 +283,14 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
gpu.return
}
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK: %[[T1:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK: %[[T2:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<2x1xf32>
-// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[T2]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD:.*]] = arith.addf %[[T2]], %[[SHUFFLE]] : f32
-// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD]], %[[SHUFFLE2]] : f32
-// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
-// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
-// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[T1]] : f32
-// CHECK: %[[T8:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK: %[[T9:.*]] = vector.extract %{{.*}}#1[1, 0] : f32 from vector<2x1xf32>
-// CHECK: %[[SHUFFLE5:.*]], %{{.*}} = gpu.shuffle xor %[[T9]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD5:.*]] = arith.addf %[[T9]], %[[SHUFFLE5]] : f32
-// CHECK: %[[SHUFFLE6:.*]], %{{.*}} = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUFFLE6]] : f32
-// CHECK: %[[SHUFFLE7:.*]], %{{.*}} = gpu.shuffle xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUFFLE7]] : f32
-// CHECK: %[[SHUFFLE8:.*]], %{{.*}} = gpu.shuffle xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUFFLE8]] : f32
-// CHECK: %[[FINAL2:.*]] = arith.addf %[[ADD8]], %[[T8]] : f32
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
+// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
+// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
+// CHECK-NEXT: %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -329,6 +313,8 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
+
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
@@ -366,29 +352,27 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %[[V0:.*]] = vector.extract %{{.*}}#2[0] : f32 from vector<2xf32>
-// CHECK: %[[V1:.*]] = vector.extract %{{.*}}#1[0, 0] : f32 from vector<1x2xf32>
-// CHECK: %[[SHUF1:.*]], %[[VALID1:.*]] = gpu.shuffle xor %[[V1]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD1:.*]] = arith.addf %[[V1]], %[[SHUF1]] : f32
-// CHECK: %[[SHUF2:.*]], %[[VALID2:.*]] = gpu.shuffle xor %[[ADD1]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUF2]] : f32
-// CHECK: %[[SHUF3:.*]], %[[VALID3:.*]] = gpu.shuffle xor %[[ADD2]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUF3]] : f32
-// CHECK: %[[SHUF4:.*]], %[[VALID4:.*]] = gpu.shuffle xor %[[ADD3]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUF4]] : f32
-// CHECK: %[[RES0:.*]] = arith.addf %[[ADD4]], %[[V0]] : f32
-// CHECK: %[[V2:.*]] = vector.extract %{{.*}}#2[1] : f32 from vector<2xf32>
-// CHECK: %[[V3:.*]] = vector.extract %{{.*}}#1[0, 1] : f32 from vector<1x2xf32>
-// CHECK: %[[SHUF5:.*]], %[[VALID5:.*]] = gpu.shuffle xor %[[V3]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD5:.*]] = arith.addf %[[V3]], %[[SHUF5]] : f32
-// CHECK: %[[SHUF6:.*]], %[[VALID6:.*]] = gpu.shuffle xor %[[ADD5]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD6:.*]] = arith.addf %[[ADD5]], %[[SHUF6]] : f32
-// CHECK: %[[SHUF7:.*]], %[[VALID7:.*]] = gpu.shuffle xor %[[ADD6]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD7:.*]] = arith.addf %[[ADD6]], %[[SHUF7]] : f32
-// CHECK: %[[SHUF8:.*]], %[[VALID8:.*]] = gpu.shuffle xor %[[ADD7]], %{{.*}}, %{{.*}} : f32
-// CHECK: %[[ADD8:.*]] = arith.addf %[[ADD7]], %[[SHUF8]] : f32
-// CHECK: %[[RES1:.*]] = arith.addf %[[ADD8]], %[[V2]] : f32
-// CHECK: %[[RESULT:.*]] = vector.from_elements %[[RES0]], %[[RES1]] : vector<2xf32>
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: %[[SRC:.*]] = "some_def"()
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: : () -> vector<16x2xf32>
+// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]]
+// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
+// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]]
+// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
+// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
+// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
>From 6f9e5b5a429b228c3492fdd0a586120157d86bfe Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 18:39:31 +0000
Subject: [PATCH 09/14] polish and remove partial-sg-size reduction test
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 12 +--
.../XeGPUSgToWiDistributeExperimental.cpp | 11 +--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 2 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 87 +++++++++----------
4 files changed, 53 insertions(+), 59 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 91b7c2202e56b..f2cbb198b2dc2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,12 +147,12 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::CombiningKind kind, int64_t reductionDim,
Location loc, PatternRewriter &rewriter);
-Value lowerToVectorReductionsCrossLane(TypedValue<VectorType> src,
- TypedValue<VectorType> acc,
- vector::CombiningKind kind,
- int64_t reductionDim,
- int64_t reductionSize, Location loc,
- PatternRewriter &rewriter);
+Value lowerCrossLaneReductionToShuffles(TypedValue<VectorType> src,
+ TypedValue<VectorType> acc,
+ vector::CombiningKind kind,
+ int64_t reductionDim,
+ int64_t reductionSize, Location loc,
+ PatternRewriter &rewriter);
/// Helper Function to find a proper instruction multiple for the user-supplied
/// sg-level data shape (diven by `dim`). `candidates` are uArch allowed shapes.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b73459b0587b1..d0dc2e8df927e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -470,6 +470,10 @@ struct SgToWiMultiDimReduction
matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value result;
+ ArrayRef<int64_t> reductionDims = op.getReductionDims();
+ assert(reductionDims.size() == 1 &&
+ "Expecting single reduction dimension for subgroup multi "
+ "reduction op");
// Only lane-local reduction is handled here.
if (isReductionLaneLocal(op)) {
auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
@@ -482,10 +486,7 @@ struct SgToWiMultiDimReduction
rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
} else {
- ArrayRef<int64_t> reductionDims = op.getReductionDims();
- assert(reductionDims.size() == 1 &&
- "Expecting single reduction dimension for subgroup multi "
- "reduction op");
+
// print adaptor.getSource() and adaptor.getAcc() for debugging
LLVM_DEBUG({
llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
@@ -499,7 +500,7 @@ struct SgToWiMultiDimReduction
SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
sourceType.getShape().end());
int64_t reductionDimSize = sourceShape[reductionDim];
- result = xegpu::lowerToVectorReductionsCrossLane(
+ result = xegpu::lowerCrossLaneReductionToShuffles(
cast<TypedValue<VectorType>>(adaptor.getSource()),
cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
reductionDim, reductionDimSize, op.getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e0a1de1af45aa..a0986608bdba0 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -750,7 +750,7 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
return reductionResult;
}
-Value xegpu::lowerToVectorReductionsCrossLane(
+Value xegpu::lowerCrossLaneReductionToShuffles(
TypedValue<VectorType> src, TypedValue<VectorType> acc,
vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
Location loc, PatternRewriter &rewriter) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index abf3ddcc3d373..029cca419fa28 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -236,63 +236,56 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x1xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<1xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c4_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c4_i32_2 = arith.constant 4 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c4_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %6 = arith.addf %5, %2 : f32
-// CHECK: %7 = vector.insert %6, %cst_1 [0] : f32 into vector<1xf32>
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction_over_partial_sg_size(%laneid: index) {
- %c0 = arith.constant 0 : index
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<0.0> : vector<1x4xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<1xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
- }
- [1] : vector<1x4xf32> to vector<1xf32>
- gpu.return
-}
-
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
// CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
// CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
// CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
// CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
-// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %c16_i32 = arith.constant 16 : i32
// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle xor %5, %c1_i32, %c2_i32 : f32
+// CHECK: %shuffleResult, %valid = gpu.shuffle xor %5, %c1_i32, %c16_i32 : f32
// CHECK: %6 = arith.addf %5, %shuffleResult : f32
-// CHECK: %7 = arith.addf %6, %4 : f32
-// CHECK: %8 = vector.insert %7, %cst_0 [0] : f32 into vector<2xf32>
-// CHECK: %9 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %10 = vector.shape_cast %9 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %11 = vector.extract %cst[1] : f32 from vector<2xf32>
-// CHECK: %12 = vector.reduction <add>, %10 : vector<1xf32> into f32
-// CHECK: %c2_i32_1 = arith.constant 2 : i32
-// CHECK: %c1_i32_2 = arith.constant 1 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %12, %c1_i32_2, %c2_i32_1 : f32
-// CHECK: %13 = arith.addf %12, %shuffleResult_3 : f32
-// CHECK: %14 = arith.addf %13, %11 : f32
-// CHECK: %15 = vector.insert %14, %8 [1] : f32 into vector<2xf32>
+// CHECK: %c16_i32_1 = arith.constant 16 : i32
+// CHECK: %c2_i32 = arith.constant 2 : i32
+// CHECK: %shuffleResult_2, %valid_3 = gpu.shuffle xor %6, %c2_i32, %c16_i32_1 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_2 : f32
+// CHECK: %c16_i32_4 = arith.constant 16 : i32
+// CHECK: %c4_i32 = arith.constant 4 : i32
+// CHECK: %shuffleResult_5, %valid_6 = gpu.shuffle xor %7, %c4_i32, %c16_i32_4 : f32
+// CHECK: %8 = arith.addf %7, %shuffleResult_5 : f32
+// CHECK: %c16_i32_7 = arith.constant 16 : i32
+// CHECK: %c8_i32 = arith.constant 8 : i32
+// CHECK: %shuffleResult_8, %valid_9 = gpu.shuffle xor %8, %c8_i32, %c16_i32_7 : f32
+// CHECK: %9 = arith.addf %8, %shuffleResult_8 : f32
+// CHECK: %10 = arith.addf %9, %4 : f32
+// CHECK: %11 = vector.insert %10, %cst_0 [0] : f32 into vector<2xf32>
+// CHECK: %12 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %13 = vector.shape_cast %12 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %14 = vector.extract %cst[1] : f32 from vector<2xf32>
+// CHECK: %15 = vector.reduction <add>, %13 : vector<1xf32> into f32
+// CHECK: %c16_i32_10 = arith.constant 16 : i32
+// CHECK: %c1_i32_11 = arith.constant 1 : i32
+// CHECK: %shuffleResult_12, %valid_13 = gpu.shuffle xor %15, %c1_i32_11, %c16_i32_10 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_12 : f32
+// CHECK: %c16_i32_14 = arith.constant 16 : i32
+// CHECK: %c2_i32_15 = arith.constant 2 : i32
+// CHECK: %shuffleResult_16, %valid_17 = gpu.shuffle xor %16, %c2_i32_15, %c16_i32_14 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_16 : f32
+// CHECK: %c16_i32_18 = arith.constant 16 : i32
+// CHECK: %c4_i32_19 = arith.constant 4 : i32
+// CHECK: %shuffleResult_20, %valid_21 = gpu.shuffle xor %17, %c4_i32_19, %c16_i32_18 : f32
+// CHECK: %18 = arith.addf %17, %shuffleResult_20 : f32
+// CHECK: %c16_i32_22 = arith.constant 16 : i32
+// CHECK: %c8_i32_23 = arith.constant 8 : i32
+// CHECK: %shuffleResult_24, %valid_25 = gpu.shuffle xor %18, %c8_i32_23, %c16_i32_22 : f32
+// CHECK: %19 = arith.addf %18, %shuffleResult_24 : f32
+// CHECK: %20 = arith.addf %19, %14 : f32
+// CHECK: %21 = vector.insert %20, %11 [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%src = "some_def"()
{layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<2x2xf32>)
+ : () -> (vector<16x2xf32>)
%acc = arith.constant
{layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
dense<0.0> : vector<2xf32>
@@ -300,7 +293,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
{
layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
}
- [0] : vector<2x2xf32> to vector<2xf32>
+ [0] : vector<16x2xf32> to vector<2xf32>
gpu.return
}
>From 4853acf670b097f6be685036597a3a624307376b Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 19:02:56 +0000
Subject: [PATCH 10/14] polish
---
.../XeGPUSgToWiDistributeExperimental.cpp | 20 +----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 51 +----------
.../XeGPU/sg-to-wi-experimental-unit.mlir | 90 ++++++++++---------
3 files changed, 54 insertions(+), 107 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index d0dc2e8df927e..32c8089c7226a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -474,27 +474,17 @@ struct SgToWiMultiDimReduction
assert(reductionDims.size() == 1 &&
"Expecting single reduction dimension for subgroup multi "
"reduction op");
- // Only lane-local reduction is handled here.
if (isReductionLaneLocal(op)) {
auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
VectorType resVecTy = dyn_cast<VectorType>(op.getType());
auto resDistVecTyOrFailure =
getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
- // Simply create a new MultiDimReductionOp using adaptor operands and the
- // new result type.
+ // For a lane-local reduction, simply create a new MultiDimReductionOp using
+ // the adaptor operands and the new result type.
result = vector::MultiDimReductionOp::create(
rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
} else {
-
- // print adaptor.getSource() and adaptor.getAcc() for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "adaptor.getSource(): " << adaptor.getSource() << "\n";
- llvm::dbgs() << "adaptor.getAcc(): " << adaptor.getAcc() << "\n";
- });
- // before get distribute vec type for source, first set its shape to be
- // unit
- // for the reduction dimension
auto reductionDim = reductionDims[0];
VectorType sourceType = op.getSourceVectorType();
SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
@@ -504,13 +494,7 @@ struct SgToWiMultiDimReduction
cast<TypedValue<VectorType>>(adaptor.getSource()),
cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
reductionDim, reductionDimSize, op.getLoc(), rewriter);
- // print the reduction op for debugging
- LLVM_DEBUG({
- llvm::dbgs() << "reductionOp3: " << *op << "\n";
- llvm::dbgs() << "lowered reduction result3: " << result << "\n";
- });
}
-
rewriter.replaceOp(op, result);
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a0986608bdba0..88e2c1a879fdd 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -121,36 +121,13 @@ xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
// dimensions are not distributed.
unsigned distributionStart =
originalType.getRank() - effectiveLaneLayout.size();
-
- // Print original shape and lane layout for debugging
- std::string shapeStr = "[";
- for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
- if (i > 0)
- shapeStr += ", ";
- shapeStr += std::to_string(dim);
- }
- shapeStr += "]";
- LDBG() << "original shape: " << shapeStr;
-
- std::string layoutStr = "[";
- for (auto [i, dim] : llvm::enumerate(effectiveLaneLayout)) {
- if (i > 0)
- layoutStr += ", ";
- layoutStr += std::to_string(dim);
- }
- layoutStr += "]";
- LDBG() << "effective lane layout: " << layoutStr;
-
for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
if (i < distributionStart)
continue;
// Check if the dimension can be distributed evenly.
- if (dim % effectiveLaneLayout[i - distributionStart] != 0) {
- assert( effectiveLaneLayout[i - distributionStart] % dim == 0 &&
- "The dimension size must be able evenly distributed to all lanes in round-robin manner.");
- distributedShape[i] = 1;
- } else
- distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+ if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
}
return VectorType::get(distributedShape, originalType.getElementType());
}
@@ -765,22 +742,6 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
- // auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
- // auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
- // // Reduction result should have the same layout as the accumulator.
- // xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
-
- // print source shape, reduction dim and reduction size for debugging
- std::string shapeStr = "[";
- for (auto [i, dim] : llvm::enumerate(sourceType.getShape())) {
- if (i > 0)
- shapeStr += ", ";
- shapeStr += std::to_string(dim);
- }
- shapeStr += "]";
- LDBG() << "source shape: " << shapeStr;
- LDBG() << "reduction dim: " << reductionDim;
- LDBG() << "reduction size: " << reductionSize;
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
@@ -794,12 +755,6 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
sliceSizes = {sourceH, 1};
}
- // print src, sliceOffsets, sliceSizes for debugging
- LDBG() << "src: " << src;
- LDBG() << "sliceOffsets: [" << sliceOffsets[0] << ", " << sliceOffsets[1]
- << "]";
- LDBG() << "sliceSizes: [" << sliceSizes[0] << ", " << sliceSizes[1] << "]";
-
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 029cca419fa28..a87a67467d7fc 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -176,6 +176,10 @@ gpu.func @vector_reduction() {
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
+// CHECK: %c0 = arith.constant 0 : index
+// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<2x1xf32>
+// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
@@ -237,55 +241,59 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %2 = vector.extract_strided_slice %1 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %3 = vector.shape_cast %2 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %4 = vector.extract %cst[0] : f32 from vector<2xf32>
-// CHECK: %5 = vector.reduction <add>, %3 : vector<1xf32> into f32
+// CHECK: %c0 = arith.constant 0 : index
+// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<1x2xf32>
+// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
+// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
// CHECK: %c16_i32 = arith.constant 16 : i32
// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle xor %5, %c1_i32, %c16_i32 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult : f32
-// CHECK: %c16_i32_1 = arith.constant 16 : i32
+// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c16_i32 : f32
+// CHECK: %4 = arith.addf %3, %shuffleResult : f32
+// CHECK: %c16_i32_2 = arith.constant 16 : i32
// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_2, %valid_3 = gpu.shuffle xor %6, %c2_i32, %c16_i32_1 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_2 : f32
-// CHECK: %c16_i32_4 = arith.constant 16 : i32
+// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c16_i32_2 : f32
+// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
+// CHECK: %c16_i32_5 = arith.constant 16 : i32
// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_5, %valid_6 = gpu.shuffle xor %7, %c4_i32, %c16_i32_4 : f32
-// CHECK: %8 = arith.addf %7, %shuffleResult_5 : f32
-// CHECK: %c16_i32_7 = arith.constant 16 : i32
+// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle xor %5, %c4_i32, %c16_i32_5 : f32
+// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
+// CHECK: %c16_i32_8 = arith.constant 16 : i32
// CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_8, %valid_9 = gpu.shuffle xor %8, %c8_i32, %c16_i32_7 : f32
-// CHECK: %9 = arith.addf %8, %shuffleResult_8 : f32
-// CHECK: %10 = arith.addf %9, %4 : f32
-// CHECK: %11 = vector.insert %10, %cst_0 [0] : f32 into vector<2xf32>
-// CHECK: %12 = vector.extract_strided_slice %1 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %13 = vector.shape_cast %12 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %14 = vector.extract %cst[1] : f32 from vector<2xf32>
-// CHECK: %15 = vector.reduction <add>, %13 : vector<1xf32> into f32
-// CHECK: %c16_i32_10 = arith.constant 16 : i32
-// CHECK: %c1_i32_11 = arith.constant 1 : i32
-// CHECK: %shuffleResult_12, %valid_13 = gpu.shuffle xor %15, %c1_i32_11, %c16_i32_10 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_12 : f32
-// CHECK: %c16_i32_14 = arith.constant 16 : i32
-// CHECK: %c2_i32_15 = arith.constant 2 : i32
-// CHECK: %shuffleResult_16, %valid_17 = gpu.shuffle xor %16, %c2_i32_15, %c16_i32_14 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_16 : f32
-// CHECK: %c16_i32_18 = arith.constant 16 : i32
-// CHECK: %c4_i32_19 = arith.constant 4 : i32
-// CHECK: %shuffleResult_20, %valid_21 = gpu.shuffle xor %17, %c4_i32_19, %c16_i32_18 : f32
-// CHECK: %18 = arith.addf %17, %shuffleResult_20 : f32
-// CHECK: %c16_i32_22 = arith.constant 16 : i32
-// CHECK: %c8_i32_23 = arith.constant 8 : i32
-// CHECK: %shuffleResult_24, %valid_25 = gpu.shuffle xor %18, %c8_i32_23, %c16_i32_22 : f32
-// CHECK: %19 = arith.addf %18, %shuffleResult_24 : f32
-// CHECK: %20 = arith.addf %19, %14 : f32
-// CHECK: %21 = vector.insert %20, %11 [1] : f32 into vector<2xf32>
+// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle xor %6, %c8_i32, %c16_i32_8 : f32
+// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
+// CHECK: %8 = arith.addf %7, %2 : f32
+// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
+// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
+// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
+// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
+// CHECK: %c16_i32_11 = arith.constant 16 : i32
+// CHECK: %c1_i32_12 = arith.constant 1 : i32
+// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle xor %13, %c1_i32_12, %c16_i32_11 : f32
+// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
+// CHECK: %c16_i32_15 = arith.constant 16 : i32
+// CHECK: %c2_i32_16 = arith.constant 2 : i32
+// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle xor %14, %c2_i32_16, %c16_i32_15 : f32
+// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
+// CHECK: %c16_i32_19 = arith.constant 16 : i32
+// CHECK: %c4_i32_20 = arith.constant 4 : i32
+// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle xor %15, %c4_i32_20, %c16_i32_19 : f32
+// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
+// CHECK: %c16_i32_23 = arith.constant 16 : i32
+// CHECK: %c8_i32_24 = arith.constant 8 : i32
+// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle xor %16, %c8_i32_24, %c16_i32_23 : f32
+// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
+// CHECK: %18 = arith.addf %17, %12 : f32
+// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = "some_def"()
+ %src = arith.constant
{layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<16x2xf32>)
+ dense<0.0> : vector<16x2xf32>
%acc = arith.constant
{layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
dense<0.0> : vector<2xf32>
>From 0a2337075ebe5adc29fd43386a44c1aacffa0d85 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 21 Feb 2026 19:24:19 +0000
Subject: [PATCH 11/14] polish legality check
---
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 32c8089c7226a..7ecc1e3128d02 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -699,11 +699,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
// vector::MultiDimReductionOp op legality.
target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
[=](vector::MultiDimReductionOp op) -> bool {
- // Check common conditions for subgroup multi reduction op.
- if (!isValidSubgroupMultiReductionOp(op))
- return true;
- // Lane local reductions are illegal at this point and must be lowered.
- return false; // !isReductionLaneLocal(op);
+ return !isValidSubgroupMultiReductionOp(op);
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
>From 5f35d52413950d4dec7290c77b4c906d04ff10b9 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:16:49 +0000
Subject: [PATCH 12/14] address feedback and fix test issue
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 8 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 192 +++++++++---------
2 files changed, 103 insertions(+), 97 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 88e2c1a879fdd..63af57335595d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -736,13 +736,19 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
VectorType sourceType = src.getType();
int64_t sourceH = sourceType.getShape()[0];
int64_t sourceW = sourceType.getShape()[1];
- int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+
// Create a constant vector to hold the result of the reduction.
TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
+ // nSlices is the number of reduction operations needed to reduce the entire
+ // source vector. For example, if reductionDim is 0, we are reducing across
+ // rows, and each slice is a column of the source vector. So the number of
+ // slices is the number of columns, which is sourceW.
+ int nSlices = (reductionDim == 0) ? sourceW : sourceH;
+
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a87a67467d7fc..450aa2cf6df05 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -176,54 +176,54 @@ gpu.func @vector_reduction() {
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK: %c0 = arith.constant 0 : index
-// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<2x1xf32>
-// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c16_i32 = arith.constant 16 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c16_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c16_i32_2 = arith.constant 16 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c16_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %c16_i32_5 = arith.constant 16 : i32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle xor %5, %c4_i32, %c16_i32_5 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
-// CHECK: %c16_i32_8 = arith.constant 16 : i32
-// CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle xor %6, %c8_i32, %c16_i32_8 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
-// CHECK: %8 = arith.addf %7, %2 : f32
-// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
-// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
-// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
-// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
-// CHECK: %c16_i32_11 = arith.constant 16 : i32
-// CHECK: %c1_i32_12 = arith.constant 1 : i32
-// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle xor %13, %c1_i32_12, %c16_i32_11 : f32
-// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
-// CHECK: %c16_i32_15 = arith.constant 16 : i32
-// CHECK: %c2_i32_16 = arith.constant 2 : i32
-// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle xor %14, %c2_i32_16, %c16_i32_15 : f32
-// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
-// CHECK: %c16_i32_19 = arith.constant 16 : i32
-// CHECK: %c4_i32_20 = arith.constant 4 : i32
-// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle xor %15, %c4_i32_20, %c16_i32_19 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
-// CHECK: %c16_i32_23 = arith.constant 16 : i32
-// CHECK: %c8_i32_24 = arith.constant 8 : i32
-// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle xor %16, %c8_i32_24, %c16_i32_23 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
-// CHECK: %18 = arith.addf %17, %12 : f32
-// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<2x1xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[V0:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %[[V1:.*]] = vector.shape_cast %[[V0]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V2:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<2xf32>
+// CHECK: %[[V3:.*]] = vector.reduction <add>, %[[V1]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
+// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
+// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
+// CHECK: %[[V11:.*]] = vector.shape_cast %[[V10]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V12:.*]] = vector.extract %[[CST_0]][1] : f32 from vector<2xf32>
+// CHECK: %[[V13:.*]] = vector.reduction <add>, %[[V11]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
+// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%src = arith.constant
@@ -241,54 +241,54 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %c0 = arith.constant 0 : index
-// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<1x2xf32>
-// CHECK: %cst_0 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %cst_1 = arith.constant dense<0.000000e+00> : vector<2xf32>
-// CHECK: %0 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %1 = vector.shape_cast %0 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %2 = vector.extract %cst_0[0] : f32 from vector<2xf32>
-// CHECK: %3 = vector.reduction <add>, %1 : vector<1xf32> into f32
-// CHECK: %c16_i32 = arith.constant 16 : i32
-// CHECK: %c1_i32 = arith.constant 1 : i32
-// CHECK: %shuffleResult, %valid = gpu.shuffle xor %3, %c1_i32, %c16_i32 : f32
-// CHECK: %4 = arith.addf %3, %shuffleResult : f32
-// CHECK: %c16_i32_2 = arith.constant 16 : i32
-// CHECK: %c2_i32 = arith.constant 2 : i32
-// CHECK: %shuffleResult_3, %valid_4 = gpu.shuffle xor %4, %c2_i32, %c16_i32_2 : f32
-// CHECK: %5 = arith.addf %4, %shuffleResult_3 : f32
-// CHECK: %c16_i32_5 = arith.constant 16 : i32
-// CHECK: %c4_i32 = arith.constant 4 : i32
-// CHECK: %shuffleResult_6, %valid_7 = gpu.shuffle xor %5, %c4_i32, %c16_i32_5 : f32
-// CHECK: %6 = arith.addf %5, %shuffleResult_6 : f32
-// CHECK: %c16_i32_8 = arith.constant 16 : i32
-// CHECK: %c8_i32 = arith.constant 8 : i32
-// CHECK: %shuffleResult_9, %valid_10 = gpu.shuffle xor %6, %c8_i32, %c16_i32_8 : f32
-// CHECK: %7 = arith.addf %6, %shuffleResult_9 : f32
-// CHECK: %8 = arith.addf %7, %2 : f32
-// CHECK: %9 = vector.insert %8, %cst_1 [0] : f32 into vector<2xf32>
-// CHECK: %10 = vector.extract_strided_slice %cst {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
-// CHECK: %11 = vector.shape_cast %10 : vector<1x1xf32> to vector<1xf32>
-// CHECK: %12 = vector.extract %cst_0[1] : f32 from vector<2xf32>
-// CHECK: %13 = vector.reduction <add>, %11 : vector<1xf32> into f32
-// CHECK: %c16_i32_11 = arith.constant 16 : i32
-// CHECK: %c1_i32_12 = arith.constant 1 : i32
-// CHECK: %shuffleResult_13, %valid_14 = gpu.shuffle xor %13, %c1_i32_12, %c16_i32_11 : f32
-// CHECK: %14 = arith.addf %13, %shuffleResult_13 : f32
-// CHECK: %c16_i32_15 = arith.constant 16 : i32
-// CHECK: %c2_i32_16 = arith.constant 2 : i32
-// CHECK: %shuffleResult_17, %valid_18 = gpu.shuffle xor %14, %c2_i32_16, %c16_i32_15 : f32
-// CHECK: %15 = arith.addf %14, %shuffleResult_17 : f32
-// CHECK: %c16_i32_19 = arith.constant 16 : i32
-// CHECK: %c4_i32_20 = arith.constant 4 : i32
-// CHECK: %shuffleResult_21, %valid_22 = gpu.shuffle xor %15, %c4_i32_20, %c16_i32_19 : f32
-// CHECK: %16 = arith.addf %15, %shuffleResult_21 : f32
-// CHECK: %c16_i32_23 = arith.constant 16 : i32
-// CHECK: %c8_i32_24 = arith.constant 8 : i32
-// CHECK: %shuffleResult_25, %valid_26 = gpu.shuffle xor %16, %c8_i32_24, %c16_i32_23 : f32
-// CHECK: %17 = arith.addf %16, %shuffleResult_25 : f32
-// CHECK: %18 = arith.addf %17, %12 : f32
-// CHECK: %19 = vector.insert %18, %9 [1] : f32 into vector<2xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32>
+// CHECK: %[[V0:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %[[V1:.*]] = vector.shape_cast %[[V0]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V2:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<2xf32>
+// CHECK: %[[V3:.*]] = vector.reduction <add>, %[[V1]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
+// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
+// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
+// CHECK: %[[V11:.*]] = vector.shape_cast %[[V10]] : vector<1x1xf32> to vector<1xf32>
+// CHECK: %[[V12:.*]] = vector.extract %[[CST_0]][1] : f32 from vector<2xf32>
+// CHECK: %[[V13:.*]] = vector.reduction <add>, %[[V11]] : vector<1xf32> into f32
+// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
+// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
+// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
+// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
+// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
+// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
+// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
+// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
+// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%src = arith.constant
>From eb81bcb1bc6cc9da34ccc0a5acf625ac70af692e Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:30:51 +0000
Subject: [PATCH 13/14] address feedback
---
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 4 ++++
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 4 +---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 7 -------
3 files changed, 5 insertions(+), 10 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index f2cbb198b2dc2..e7cae506d9f4e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,6 +147,10 @@ Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::CombiningKind kind, int64_t reductionDim,
Location loc, PatternRewriter &rewriter);
+/// Lowers a cross-lane reduction of a 2D vector to shuffle operations.
+/// For each slice along `reductionDim`, reduces the slice across
+/// `reductionSize` work-items with shuffles, combines the result with the
+/// matching element of `acc`, and inserts it into a vector of `acc`'s type.
Value lowerCrossLaneReductionToShuffles(TypedValue<VectorType> src,
TypedValue<VectorType> acc,
vector::CombiningKind kind,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 7ecc1e3128d02..32ead1867aa23 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -487,9 +487,7 @@ struct SgToWiMultiDimReduction
} else {
auto reductionDim = reductionDims[0];
VectorType sourceType = op.getSourceVectorType();
- SmallVector<int64_t, 2> sourceShape(sourceType.getShape().begin(),
- sourceType.getShape().end());
- int64_t reductionDimSize = sourceShape[reductionDim];
+ int64_t reductionDimSize = sourceType.getShape()[reductionDim];
result = xegpu::lowerCrossLaneReductionToShuffles(
cast<TypedValue<VectorType>>(adaptor.getSource()),
cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 63af57335595d..3271e73e0b571 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -764,24 +764,17 @@ Value xegpu::lowerCrossLaneReductionToShuffles(
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
-
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
-
vector::ShapeCastOp slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
- // Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
-
- // Distribute and reduce across work-items in the subgroup.
Value fullReduce =
xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
-
fullReduce =
vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
-
reductionResult =
vector::InsertOp::create(rewriter, loc, fullReduce, reductionResult, i);
}
>From e67872e34b6a1dec6210d55f02f25b5c5089b87c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 24 Feb 2026 22:50:37 +0000
Subject: [PATCH 14/14] fix test
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 40 +++++++++----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 450aa2cf6df05..2225cfb4021cb 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -154,19 +154,19 @@ gpu.func @prefetch_nd() {
// CHECK-DAG: %[[C16_1:.*]] = arith.constant 16 : i32
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
// CHECK: %[[SHUFFLE1:.*]], %{{.*}} = gpu.shuffle xor %[[LANE_RED]], %[[C1]], %[[C16_1]] : f32
-// CHECK: %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1]] : f32
+// CHECK: %[[ADD1:.*]] = arith.addf %[[LANE_RED]], %[[SHUFFLE1:.*]] : f32
// CHECK-DAG: %[[C16_2:.*]] = arith.constant 16 : i32
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
// CHECK: %[[SHUFFLE2:.*]], %{{.*}} = gpu.shuffle xor %[[ADD1]], %[[C2]], %[[C16_2]] : f32
-// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2]] : f32
+// CHECK: %[[ADD2:.*]] = arith.addf %[[ADD1]], %[[SHUFFLE2:.*]] : f32
// CHECK-DAG: %[[C16_3:.*]] = arith.constant 16 : i32
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
// CHECK: %[[SHUFFLE3:.*]], %{{.*}} = gpu.shuffle xor %[[ADD2]], %[[C4]], %[[C16_3]] : f32
-// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3]] : f32
+// CHECK: %[[ADD3:.*]] = arith.addf %[[ADD2]], %[[SHUFFLE3:.*]] : f32
// CHECK-DAG: %[[C16_4:.*]] = arith.constant 16 : i32
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
// CHECK: %[[SHUFFLE4:.*]], %{{.*}} = gpu.shuffle xor %[[ADD3]], %[[C8]], %[[C16_4]] : f32
-// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4]] : f32
+// CHECK: %[[ADD4:.*]] = arith.addf %[[ADD3]], %[[SHUFFLE4:.*]] : f32
// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
gpu.func @vector_reduction() {
%acc = arith.constant 1.0 : f32
@@ -187,19 +187,19 @@ gpu.func @vector_reduction() {
// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
-// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE:.*]] : f32
// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
-// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2:.*]] : f32
// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
-// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3:.*]] : f32
// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
-// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4:.*]] : f32
// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [1, 0], sizes = [1, 1], strides = [1, 1]} : vector<2x1xf32> to vector<1x1xf32>
@@ -209,19 +209,19 @@ gpu.func @vector_reduction() {
// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
-// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5:.*]] : f32
// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
-// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6:.*]] : f32
// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
-// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7:.*]] : f32
// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
-// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8:.*]] : f32
// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
@@ -252,19 +252,19 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
// CHECK: %[[C16_I32:.*]] = arith.constant 16 : i32
// CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
// CHECK: %[[SHUFFLE:.*]], %{{.*}} = gpu.shuffle xor %[[V3]], %[[C1_I32]], %[[C16_I32]] : f32
-// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE]] : f32
+// CHECK: %[[V4:.*]] = arith.addf %[[V3]], %[[SHUFFLE:.*]] : f32
// CHECK: %[[C16_I32_2:.*]] = arith.constant 16 : i32
// CHECK: %[[C2_I32:.*]] = arith.constant 2 : i32
// CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[V4]], %[[C2_I32]], %[[C16_I32_2]] : f32
-// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2]] : f32
+// CHECK: %[[V5:.*]] = arith.addf %[[V4]], %[[SHUFFLE_2:.*]] : f32
// CHECK: %[[C16_I32_3:.*]] = arith.constant 16 : i32
// CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[V5]], %[[C4_I32]], %[[C16_I32_3]] : f32
-// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3]] : f32
+// CHECK: %[[V6:.*]] = arith.addf %[[V5]], %[[SHUFFLE_3:.*]] : f32
// CHECK: %[[C16_I32_4:.*]] = arith.constant 16 : i32
// CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
// CHECK: %[[SHUFFLE_4:.*]], %{{.*}} = gpu.shuffle xor %[[V6]], %[[C8_I32]], %[[C16_I32_4]] : f32
-// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4]] : f32
+// CHECK: %[[V7:.*]] = arith.addf %[[V6]], %[[SHUFFLE_4:.*]] : f32
// CHECK: %[[V8:.*]] = arith.addf %[[V7]], %[[V2]] : f32
// CHECK: %[[V9:.*]] = vector.insert %[[V8]], %[[CST_1]] [0] : f32 into vector<2xf32>
// CHECK: %[[V10:.*]] = vector.extract_strided_slice %[[CST]] {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x2xf32> to vector<1x1xf32>
@@ -274,19 +274,19 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
// CHECK: %[[C16_I32_5:.*]] = arith.constant 16 : i32
// CHECK: %[[C1_I32_2:.*]] = arith.constant 1 : i32
// CHECK: %[[SHUFFLE_5:.*]], %{{.*}} = gpu.shuffle xor %[[V13]], %[[C1_I32_2]], %[[C16_I32_5]] : f32
-// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5]] : f32
+// CHECK: %[[V14:.*]] = arith.addf %[[V13]], %[[SHUFFLE_5:.*]] : f32
// CHECK: %[[C16_I32_6:.*]] = arith.constant 16 : i32
// CHECK: %[[C2_I32_2:.*]] = arith.constant 2 : i32
// CHECK: %[[SHUFFLE_6:.*]], %{{.*}} = gpu.shuffle xor %[[V14]], %[[C2_I32_2]], %[[C16_I32_6]] : f32
-// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6]] : f32
+// CHECK: %[[V15:.*]] = arith.addf %[[V14]], %[[SHUFFLE_6:.*]] : f32
// CHECK: %[[C16_I32_7:.*]] = arith.constant 16 : i32
// CHECK: %[[C4_I32_2:.*]] = arith.constant 4 : i32
// CHECK: %[[SHUFFLE_7:.*]], %{{.*}} = gpu.shuffle xor %[[V15]], %[[C4_I32_2]], %[[C16_I32_7]] : f32
-// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7]] : f32
+// CHECK: %[[V16:.*]] = arith.addf %[[V15]], %[[SHUFFLE_7:.*]] : f32
// CHECK: %[[C16_I32_8:.*]] = arith.constant 16 : i32
// CHECK: %[[C8_I32_2:.*]] = arith.constant 8 : i32
// CHECK: %[[SHUFFLE_8:.*]], %{{.*}} = gpu.shuffle xor %[[V16]], %[[C8_I32_2]], %[[C16_I32_8]] : f32
-// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8]] : f32
+// CHECK: %[[V17:.*]] = arith.addf %[[V16]], %[[SHUFFLE_8:.*]] : f32
// CHECK: %[[V18:.*]] = arith.addf %[[V17]], %[[V12]] : f32
// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
More information about the Mlir-commits
mailing list