[Mlir-commits] [mlir] [MLIR][XeGPU] Add 2D `vector.multi_reduction` optimization (PR #171154)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Dec 8 08:36:10 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: Artem Kroviakov (akroviakov)
<details>
<summary>Changes</summary>
This PR adds an optimization that rewrites a 2D `vector.multi_reduction`. The 2D reduction is split into two 1D reductions (reducing the non-communicating dimension first, when possible) so that a distribution pattern can consume them later.
---
Full diff: https://github.com/llvm/llvm-project/pull/171154.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp (+131-1)
- (added) mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir (+85)
``````````diff
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index ab41fe4298d99..238599e21f65a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -416,12 +416,131 @@ class VectorExtractOpPattern final
}
};
+/// Splits a `vector.multi_reduction` over exactly two dimensions into two
+/// chained single-dimension reductions.
+///
+/// The reduction dim whose lane-layout entry is 1 (the "intra-lane" dim) is
+/// reduced first; the other ("cross-lane") dim is reduced second. When the
+/// lane layout does not yield one dim of each kind, the reduction dims are
+/// processed in their original order.
+///
+/// A scalar accumulator is wrapped into a one-element vector for the rewrite
+/// and the final result is extracted back to a scalar.
+class MultiRed2dOp final
+    : public OpConversionPattern<vector::MultiDimReductionOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  /// Replaces `reductionOp` with an intra-lane reduction followed by a
+  /// cross-lane reduction. Returns a match failure, without creating any op,
+  /// when the op does not reduce exactly two dims or when its result does not
+  /// carry a subgroup slice layout that contains the cross-lane dim.
+  LogicalResult
+  matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (reductionOp.getReductionDims().size() != 2)
+      return rewriter.notifyMatchFailure(reductionOp,
+                                         "Expected 2D multi reduction");
+
+    // Every bail-out below happens before any op is created, so a failed
+    // match leaves the IR untouched.
+    auto layout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
+    if (!layout || !layout.isForSubgroup())
+      return rewriter.notifyMatchFailure(
+          reductionOp, "Expected a subgroup layout on the result");
+    // The intermediate layout is derived by editing the slice dims, so the
+    // result layout has to be a slice.
+    auto resultSliceLayout = dyn_cast<xegpu::SliceAttr>(layout);
+    if (!resultSliceLayout)
+      return rewriter.notifyMatchFailure(
+          reductionOp, "Expected a slice layout on the result");
+
+    auto dims = llvm::to_vector(reductionOp.getReductionDims());
+    auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, layout);
+    // The layout did not yield one dim of each kind: order does not matter.
+    if (intraLaneDim == -1 || crossLaneDim == -1) {
+      intraLaneDim = dims[0];
+      crossLaneDim = dims[1];
+    }
+
+    // Preserve layout in the intermediate reduction (apart from the reduced
+    // dim): the cross-lane dim is still present after the first reduction, so
+    // it is removed from the slice dims.
+    SmallVector<int64_t> sliceDims{resultSliceLayout.getDims().asArrayRef()};
+    auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), crossLaneDim);
+    if (foundIt == sliceDims.end())
+      return rewriter.notifyMatchFailure(
+          reductionOp, "Expected to find reduction dim in slice dims");
+    sliceDims.erase(foundIt);
+    auto intraLaneLayout = xegpu::SliceAttr::get(
+        reductionOp.getContext(), resultSliceLayout.getParent(),
+        DenseI64ArrayAttr::get(getContext(), sliceDims));
+
+    auto loc = reductionOp.getLoc();
+    // XeGPU transforms expect vector types
+    auto sourceVecType = reductionOp.getSourceVectorType();
+    auto acc = reductionOp.getAcc();
+    bool scalarAcc = !isa<VectorType>(acc.getType());
+    if (scalarAcc)
+      acc = vector::FromElementsOp::create(
+          rewriter, loc, VectorType::get({1}, sourceVecType.getElementType()),
+          acc);
+
+    // First we do intra-lane reduction
+    SmallVector<int64_t> accShape(sourceVecType.getShape());
+    accShape.erase(accShape.begin() + intraLaneDim);
+    // Add a dim to the lower-dim user-supplied acc
+    Value firstRedAcc = acc;
+    if (firstRedAcc) {
+      firstRedAcc = vector::BroadcastOp::create(
+          rewriter, loc,
+          VectorType::get(accShape, sourceVecType.getElementType()), acc);
+      xegpu::setDistributeLayoutAttr(
+          llvm::dyn_cast<OpResult>(firstRedAcc),
+          cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+    }
+    Value intraLaneReduced = vector::MultiDimReductionOp::create(
+        rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
+        firstRedAcc, ArrayRef<int64_t>(intraLaneDim));
+    xegpu::setDistributeLayoutAttr(
+        llvm::dyn_cast<OpResult>(intraLaneReduced),
+        cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+
+    if (scalarAcc) {
+      // For scalar results, add a unit dim where intra lane dim was. The
+      // cross-lane dim therefore keeps its original index.
+      SmallVector<int64_t> vecTypeWithUnitDim{sourceVecType.getShape()};
+      vecTypeWithUnitDim[intraLaneDim] = 1;
+      intraLaneReduced = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(vecTypeWithUnitDim, sourceVecType.getElementType()),
+          intraLaneReduced);
+      // Layout matches last reduction
+      xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(intraLaneReduced),
+                                     layout);
+    } else {
+      // The intra-lane dim is gone from the shape, so a cross-lane dim that
+      // came after it moves down by one.
+      crossLaneDim -= static_cast<int64_t>(intraLaneDim < crossLaneDim);
+    }
+    // Do cross-lane reduction
+    Value crossLaneReduced = vector::MultiDimReductionOp::create(
+        rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
+        ArrayRef<int64_t>(crossLaneDim));
+    xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(crossLaneReduced),
+                                   layout);
+
+    // Undo the scalar -> vector<1> wrapping applied to the accumulator.
+    if (scalarAcc)
+      crossLaneReduced =
+          vector::ExtractOp::create(rewriter, loc, crossLaneReduced, 0);
+    assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
+           "Type mismatch");
+    rewriter.replaceOp(reductionOp, crossLaneReduced);
+    return success();
+  }
+
+private:
+  /// Classifies the two reduction dims using the effective lane layout of the
+  /// root (non-slice) layout behind `layout`.
+  ///
+  /// Returns `{intra, cross}`: `intra` is the last dim in `reductionDims`
+  /// whose lane-layout entry is 1, `cross` is the last dim whose entry is not
+  /// 1. Either member is -1 when no dim of that kind exists.
+  std::pair<int64_t, int64_t>
+  getReductionDimOrder(ArrayRef<int64_t> reductionDims,
+                       xegpu::DistributeLayoutAttr layout) const {
+    assert(layout.isForSubgroup() && "Must know the lane layout");
+    assert(reductionDims.size() == 2 && "Expected 2D reduction");
+    // Both start at the -1 sentinel; the caller tests each against -1, so
+    // neither may be left uninitialized when the loop never assigns it.
+    int64_t intra = -1;
+    int64_t cross = -1;
+    xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
+    if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout)) {
+      // Walk up nested slices until the parent is no longer a slice.
+      while (dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent()))
+        layoutSliceAttr =
+            dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent());
+      layoutAttr = dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.getParent());
+    }
+    assert(layoutAttr);
+    SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
+
+    assert(laneLayout.size() && "Expected a non-empty layout");
+    // try to pick a dim that does not communicate
+    for (auto dim : reductionDims) {
+      // NOTE(review): dims are used directly as indices into the root lane
+      // layout; confirm this also holds when slices are nested.
+      assert(dim >= 0 && static_cast<size_t>(dim) < laneLayout.size() &&
+             "Reduction dim out of lane layout bounds");
+      if (laneLayout[dim] == 1)
+        intra = dim;
+      else
+        cross = dim;
+    }
+    return {intra, cross};
+  }
+};
+
} // namespace
void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
RewritePatternSet &patterns) {
patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
- VectorExtractOpPattern>(patterns.getContext());
+ VectorExtractOpPattern, MultiRed2dOp>(patterns.getContext()); // MultiRed2dOp: the 2D vector.multi_reduction rewrite added by this patch.
}
namespace {
@@ -472,6 +591,17 @@ struct XeGPUOptimizeBlockLoadsPass final
auto laneData = layout.getEffectiveLaneDataAsInt();
return !canBeOptimizedForTranspose(laneLayout, laneData);
});
+
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [=](Operation *op) -> bool {
+ auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+ if (!layout || !layout.isForSubgroup())
+ return true;
+ if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
+ return reductionOp.getReductionDims().size() != 2;
+ return true;
+ });
+
converter.addConversion([](Type type) { return type; });
target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
new file mode 100644
index 0000000000000..754825193a10f
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
+// RUN: --xegpu-optimize-block-loads --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
+// CHECK: %[[ACC_VEC:.*]] = vector.from_elements %[[ACC]] : vector<1xf32>
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.shape_cast %[[LOADED_REDUCED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<16xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED_FOR_CROSS]], %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [1] : vector<1x16xf32> to vector<1xf32>
+// CHECK: %[[SCALAR_RES:.*]] = vector.extract %[[LOADED_REDUCED_2D]][0] : f32 from vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) { // Case 1: scalar f32 accumulator; 4x16 source reduced over dims [0, 1].
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.0 : f32
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+ -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ %load = xegpu.load_nd %tdesc[0, 0]
+ : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ -> vector<4x16xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [0, 1]
+ : vector<4x16xf32> to f32
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} : vector<1xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} [1] : vector<1x4x16xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1] : vector<1x16xf32> to vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) { // Case 2: vector<1xf32> accumulator; 1x4x16 source reduced over dims [1, 2].
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+ -> !xegpu.tensor_desc<4x16xf32>
+ %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+ %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32>
+ %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1,1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1, 2]
+ : vector<1x4x16xf32> to vector<1xf32>
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x64xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x64xf32> -> !xegpu.tensor_desc<1x64xf32>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
+// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} : vector<1xf32> to vector<1x64xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} [1] : vector<1x1x64xf32> to vector<1x64xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1] : vector<1x64xf32> to vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x64xf32>) { // Case 3: vector<1xf32> accumulator; 1x1x64 source with lane_data [1, 1, 4], reduced over dims [1, 2].
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x64xf32>
+ -> !xegpu.tensor_desc<1x64xf32>
+ %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
+ %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
+ %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2]
+ : vector<1x1x64xf32> to vector<1xf32>
+ gpu.return
+ }
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/171154
More information about the Mlir-commits
mailing list