[Mlir-commits] [mlir] 8378b6d - [MLIR][XeGPU] Lower vector.multi_reduction to vector.reduction for lane local (#191037)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Apr 10 20:58:30 PDT 2026
Author: Nishant Patel
Date: 2026-04-10T20:58:25-07:00
New Revision: 8378b6d51ee611a805c432b925ed2101400eaba2
URL: https://github.com/llvm/llvm-project/commit/8378b6d51ee611a805c432b925ed2101400eaba2
DIFF: https://github.com/llvm/llvm-project/commit/8378b6d51ee611a805c432b925ed2101400eaba2.diff
LOG: [MLIR][XeGPU] Lower vector.multi_reduction to vector.reduction for lane local (#191037)
Added:
Modified:
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index e3227c7f5b149..b086a6571ddb4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -590,15 +590,14 @@ struct SgToWiMultiDimReduction
result = vector::makeArithReduction(rewriter, op.getLoc(), op.getKind(),
result, adaptor.getAcc());
} else if (isReductionLaneLocal(op)) {
- auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
- VectorType resVecTy = dyn_cast<VectorType>(op.getType());
- auto resDistVecTyOrFailure =
- getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
- // For lane local reduction, simply create a new MultiDimReductionOp using
- // adaptor operands and the new result type.
- result = vector::MultiDimReductionOp::create(
- rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
- adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
+ // For lane-local reduction, lower to a sequence of vector.reduction ops
+ // over 1D slices extracted from the distributed source vector. This is
+ // required so we dont have 2D source vectors at xegpu-linearize.
+ auto reductionDim = reductionDims[0];
+ result = xegpu::lowerToVectorReductions(
+ cast<TypedValue<VectorType>>(adaptor.getSource()),
+ cast<TypedValue<VectorType>>(adaptor.getAcc()), op.getKind(),
+ reductionDim, op.getLoc(), rewriter);
} else {
auto reductionDim = reductionDims[0];
VectorType sourceType = op.getSourceVectorType();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index f0508a30621f2..e83f96bb294a9 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -690,6 +690,8 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
+ // TODO: Remove these get/setTemporaryLayout calls after we deprecate the old
+ // XeGPUSubgroupDistribute pass.
auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
// Reduction result should have the same layout as the accumulator.
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 0335105ebe7f0..4c3727388831b 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -432,9 +432,13 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<4x1xf32>
-// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST]], %[[CST_0]] [0] : vector<4x1xf32> to vector<1xf32>
+// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<0.000000e+00> : vector<4x1xf32>
+// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[SLICE:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x1xf32> to vector<4x1xf32>
+// CHECK: %[[FLAT:.*]] = vector.shape_cast %[[SLICE]] : vector<4x1xf32> to vector<4xf32>
+// CHECK: %[[ACC_EL:.*]] = vector.extract %[[ACC]][0] : f32 from vector<1xf32>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[FLAT]], %[[ACC_EL]] : vector<4xf32> into f32
+// CHECK: vector.insert %[[RED]], %{{.*}} [0] : f32 into vector<1xf32>
// CHECK: gpu.return
gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
@@ -453,9 +457,13 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
}
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x12xf32>
-// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
-// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST]], %[[CST_0]] [1] : vector<1x12xf32> to vector<1xf32>
+// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<0.000000e+00> : vector<1x12xf32>
+// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[SLICE:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [1, 12], strides = [1, 1]} : vector<1x12xf32> to vector<1x12xf32>
+// CHECK: %[[FLAT:.*]] = vector.shape_cast %[[SLICE]] : vector<1x12xf32> to vector<12xf32>
+// CHECK: %[[ACC_EL:.*]] = vector.extract %[[ACC]][0] : f32 from vector<1xf32>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[FLAT]], %[[ACC_EL]] : vector<12xf32> into f32
+// CHECK: vector.insert %[[RED]], %{{.*}} [0] : f32 into vector<1xf32>
// CHECK: gpu.return
gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
@@ -582,9 +590,18 @@ gpu.func @constant_mask_2d() {
// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x16x2xf32>
-// CHECK: %[[CST_0:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
-// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST]], %[[CST_0]] [1] : vector<1x16x2xf32> to vector<1x2xf32>
+// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<0.000000e+00> : vector<1x16x2xf32>
+// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
+// CHECK: %[[S0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0, 0], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
+// CHECK: %[[F0:.*]] = vector.shape_cast %[[S0]] : vector<1x16x1xf32> to vector<16xf32>
+// CHECK: %[[A0:.*]] = vector.extract %[[ACC]][0, 0] : f32 from vector<1x2xf32>
+// CHECK: %[[R0:.*]] = vector.reduction <add>, %[[F0]], %[[A0]] : vector<16xf32> into f32
+// CHECK: %[[I0:.*]] = vector.insert %[[R0]], %{{.*}} [0, 0] : f32 into vector<1x2xf32>
+// CHECK: %[[S1:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0, 1], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
+// CHECK: %[[F1:.*]] = vector.shape_cast %[[S1]] : vector<1x16x1xf32> to vector<16xf32>
+// CHECK: %[[A1:.*]] = vector.extract %[[ACC]][0, 1] : f32 from vector<1x2xf32>
+// CHECK: %[[R1:.*]] = vector.reduction <add>, %[[F1]], %[[A1]] : vector<16xf32> into f32
+// CHECK: vector.insert %[[R1]], %[[I0]] [0, 1] : f32 into vector<1x2xf32>
// CHECK: gpu.return
gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local() {
%src = arith.constant
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index 9febd79c7adc3..babb01c131792 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -445,7 +445,8 @@ gpu.module @xevm_module{
// -----
// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
-// CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : vector<1xf16> to vector<16xf16>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %{{.*}}, %{{.*}} : vector<16xf16> into f16
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[RED]] : f16 to vector<16xf16>
gpu.module @xevm_module{
gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
%c0 = arith.constant 0 : index
More information about the Mlir-commits
mailing list