[Mlir-commits] [mlir] [MLIR][XeGPU] Add 2D `vector.multi_reduction` optimization (PR #171154)
Artem Kroviakov
llvmlistbot at llvm.org
Wed Jan 7 05:53:59 PST 2026
https://github.com/akroviakov updated https://github.com/llvm/llvm-project/pull/171154
>From d7564a36639a7141954e33c87f390ed9c00adca0 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Fri, 5 Dec 2025 16:08:04 +0000
Subject: [PATCH 01/10] [MLIR][XeGPU] Add sg layout propagation
---
.../Dialect/XeGPU/propagate-layout-sg.mlir | 53 +++++++++++++++++++
1 file changed, 53 insertions(+)
create mode 100644 mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
new file mode 100644
index 0000000000000..5659e9995b22a
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=sg" -split-input-file %s | FileCheck %s
+
+gpu.module @test {
+ // CHECK-LABEL: store_nd
+ // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
+ func.func @store_nd(%src: memref<256x128xf32>) {
+ // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32>
+ // CHECK-SAME: -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}>
+ // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
+ // CHECK-SAME: : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ // CHECK-SAME: -> vector<256x128xf32>
+ // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}>
+ // CHECK-SAME: : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32>
+ %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32>
+ xegpu.store_nd %load, %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
+ : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
+ return
+ }
+}
+
+// -----
+
+gpu.module @test {
+ // CHECK-LABEL: vector_transpose
+ // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
+ // CHECK-SAME: %[[ARG_1:.*]]: memref<128x256xf32>
+ func.func @vector_transpose(%src: memref<256x128xf32>, %src1: memref<128x256xf32>) {
+ // CHECK: %[[TDESC_LD:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> ->
+ // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>>
+ // CHECK: %[[TDESC_ST:.*]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf32> ->
+ // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>>
+
+ // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC_LD]][0, 0] <{layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>}>
+ // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>} :
+ // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>> -> vector<256x128xf32>
+
+ // CHECK: %[[TRANSPOSED:.*]] = vector.transpose %2, [1, 0]
+ // CHECK-SAME {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>} : vector<256x128xf32> to vector<128x256xf32>
+
+ // CHECK: xegpu.store_nd %[[TRANSPOSED]], %[[TDESC_ST]][0, 0]
+ // CHECK-SAME: <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>}> : vector<128x256xf32>,
+ // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32>
+ %tdesc1 = xegpu.create_nd_tdesc %src1 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32>
+ %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32>
+ %trans = vector.transpose %load, [1, 0] : vector<256x128xf32> to vector<128x256xf32>
+ xegpu.store_nd %trans, %tdesc1[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>}
+ : vector<128x256xf32>, !xegpu.tensor_desc<128x256xf32>
+ return
+ }
+}
>From bbd4c2e320900e8c9a540e63033d6f1a2264f550 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 16 Dec 2025 16:15:40 +0000
Subject: [PATCH 02/10] Rename subgroup prop option
---
.../Dialect/XeGPU/propagate-layout-sg.mlir | 53 -------------------
1 file changed, 53 deletions(-)
delete mode 100644 mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
deleted file mode 100644
index 5659e9995b22a..0000000000000
--- a/mlir/test/Dialect/XeGPU/propagate-layout-sg.mlir
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=sg" -split-input-file %s | FileCheck %s
-
-gpu.module @test {
- // CHECK-LABEL: store_nd
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- func.func @store_nd(%src: memref<256x128xf32>) {
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
- // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}>
- // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
- // CHECK-SAME: : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
- // CHECK-SAME: -> vector<256x128xf32>
- // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}>
- // CHECK-SAME: : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
- %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32>
- %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
- return
- }
-}
-
-// -----
-
-gpu.module @test {
- // CHECK-LABEL: vector_transpose
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- // CHECK-SAME: %[[ARG_1:.*]]: memref<128x256xf32>
- func.func @vector_transpose(%src: memref<256x128xf32>, %src1: memref<128x256xf32>) {
- // CHECK: %[[TDESC_LD:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> ->
- // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>>
- // CHECK: %[[TDESC_ST:.*]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf32> ->
- // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>>
-
- // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC_LD]][0, 0] <{layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>}>
- // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>} :
- // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], order = [0, 1]>> -> vector<256x128xf32>
-
- // CHECK: %[[TRANSPOSED:.*]] = vector.transpose %2, [1, 0]
- // CHECK-SAME {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>} : vector<256x128xf32> to vector<128x256xf32>
-
- // CHECK: xegpu.store_nd %[[TRANSPOSED]], %[[TDESC_ST]][0, 0]
- // CHECK-SAME: <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>}> : vector<128x256xf32>,
- // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>>
- %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32>
- %tdesc1 = xegpu.create_nd_tdesc %src1 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32>
- %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32>
- %trans = vector.transpose %load, [1, 0] : vector<256x128xf32> to vector<128x256xf32>
- xegpu.store_nd %trans, %tdesc1[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], order = [1, 0]>}
- : vector<128x256xf32>, !xegpu.tensor_desc<128x256xf32>
- return
- }
-}
>From dd1dde36078dd2deb688d7dc3062aa6f24a100fc Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Mon, 8 Dec 2025 16:33:23 +0000
Subject: [PATCH 03/10] [MLIR][XeGPU] Add 2D `vector.multi_reduction`
optimization
---
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 132 +++++++++++++++++-
.../Dialect/XeGPU/optimize-2d-reduction.mlir | 85 +++++++++++
2 files changed, 216 insertions(+), 1 deletion(-)
create mode 100644 mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index bb80df197d45b..b17b204a07dbc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -416,12 +416,131 @@ class VectorExtractOpPattern final
}
};
+class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
+ using OpConversionPattern::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (reductionOp.getReductionDims().size() != 2)
+ return rewriter.notifyMatchFailure(reductionOp,
+ "Expected 2D multi reduction");
+
+ auto layout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
+
+ auto dims = llvm::to_vector(reductionOp.getReductionDims());
+ auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, layout);
+ // Order does not matter
+ if (intraLaneDim == -1 || crossLaneDim == -1) {
+ intraLaneDim = dims[0];
+ crossLaneDim = dims[1];
+ }
+ auto loc = reductionOp.getLoc();
+ // XeGPU transforms expect vector types
+ auto sourceVecType = reductionOp.getSourceVectorType();
+ auto acc = reductionOp.getAcc();
+ bool scalarAcc = !isa<VectorType>(acc.getType());
+ if (scalarAcc)
+ acc = vector::FromElementsOp::create(
+ rewriter, loc, VectorType::get({1}, sourceVecType.getElementType()),
+ acc);
+
+ // Preserve layout in the intermediate reduction (apart from the reduced
+ // dim)
+ auto sourceSliceLayoutAttr = cast<xegpu::SliceAttr>(layout);
+ SmallVector<int64_t> sliceDims{
+ sourceSliceLayoutAttr.getDims().asArrayRef()};
+ auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), crossLaneDim);
+ assert(foundIt != sliceDims.end() &&
+ "Expected to find reduction dim in slice dims");
+ sliceDims.erase(foundIt);
+ auto intraLaneLayout = xegpu::SliceAttr::get(
+ reductionOp.getContext(), sourceSliceLayoutAttr.getParent(),
+ DenseI64ArrayAttr::get(getContext(), sliceDims));
+
+ // First we do intra-lane reduction
+ SmallVector<int64_t> accShape(sourceVecType.getShape());
+ accShape.erase(accShape.begin() + intraLaneDim);
+ // Add a dim to the lower-dim user-supplied acc
+ Value firstRedAcc = acc;
+ if (firstRedAcc) {
+ firstRedAcc = vector::BroadcastOp::create(
+ rewriter, loc,
+ VectorType::get(accShape, sourceVecType.getElementType()), acc);
+ xegpu::setDistributeLayoutAttr(
+ llvm::dyn_cast<OpResult>(firstRedAcc),
+ cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+ }
+ Value intraLaneReduced = vector::MultiDimReductionOp::create(
+ rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
+ firstRedAcc, ArrayRef<int64_t>(intraLaneDim));
+ xegpu::setDistributeLayoutAttr(
+ llvm::dyn_cast<OpResult>(intraLaneReduced),
+ cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+
+ // For scalar results, add a unit dim where intra lane dim was
+ if (scalarAcc) {
+ SmallVector<int64_t> vecTypeWithUnitDim{sourceVecType.getShape()};
+ vecTypeWithUnitDim[intraLaneDim] = 1;
+ intraLaneReduced = vector::ShapeCastOp::create(
+ rewriter, loc,
+ VectorType::get(vecTypeWithUnitDim, sourceVecType.getElementType()),
+ intraLaneReduced);
+ // Layout matches last reduction
+ xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(intraLaneReduced),
+ layout);
+ } else
+ crossLaneDim -= static_cast<int64_t>(intraLaneDim < crossLaneDim);
+ // Do cross-lane reduction
+ Value crossLaneReduced = vector::MultiDimReductionOp::create(
+ rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
+ ArrayRef<int64_t>(crossLaneDim));
+ xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(crossLaneReduced),
+ layout);
+
+ if (scalarAcc)
+ crossLaneReduced =
+ vector::ExtractOp::create(rewriter, loc, crossLaneReduced, 0);
+ assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
+ "Type mismatch");
+ rewriter.replaceOp(reductionOp, crossLaneReduced);
+ return success();
+ }
+
+private:
+ // Splits the two reduction dims into {intra-lane, cross-lane}; -1 if absent.
+ std::pair<int64_t, int64_t>
+ getReductionDimOrder(ArrayRef<int64_t> reductionDims,
+ xegpu::DistributeLayoutAttr layout) const {
+ assert(layout.isForSubgroup() && "Must know the lane layout");
+ assert(reductionDims.size() == 2 && "Expected 2D reduction");
+ int64_t intra = -1, cross = -1; // Both start as "not found"; caller checks.
+ xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
+ if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout)) {
+ while (dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent()))
+ layoutSliceAttr =
+ dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent());
+ layoutAttr = dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.getParent());
+ }
+ assert(layoutAttr);
+ SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
+ assert(laneLayout.size() && "Expected a non-empty layout");
+ // Lane-layout extent 1 => treat the dim as intra-lane (no communication).
+ for (auto dim : reductionDims) {
+ if (laneLayout[dim] == 1)
+ intra = dim;
+ else
+ cross = dim;
+ }
+ return {intra, cross};
+ }
+};
+
} // namespace
void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
RewritePatternSet &patterns) {
patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
- VectorExtractOpPattern>(patterns.getContext());
+ VectorExtractOpPattern, MultiRed2dOp>(patterns.getContext());
}
namespace {
@@ -473,6 +592,17 @@ struct XeGPUOptimizeBlockLoadsPass final
auto laneData = layout.getEffectiveLaneDataAsInt();
return !canBeOptimizedForTranspose(laneLayout, laneData);
});
+
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [=](Operation *op) -> bool {
+ auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+ if (!layout || !layout.isForSubgroup())
+ return true;
+ if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
+ return reductionOp.getReductionDims().size() != 2;
+ return true;
+ });
+
converter.addConversion([](Type type) { return type; });
target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
new file mode 100644
index 0000000000000..754825193a10f
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
+// RUN: --xegpu-optimize-block-loads --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
+// CHECK: %[[ACC_VEC:.*]] = vector.from_elements %[[ACC]] : vector<1xf32>
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.shape_cast %[[LOADED_REDUCED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<16xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED_FOR_CROSS]], %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [1] : vector<1x16xf32> to vector<1xf32>
+// CHECK: %[[SCALAR_RES:.*]] = vector.extract %[[LOADED_REDUCED_2D]][0] : f32 from vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.0 : f32
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+ -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ %load = xegpu.load_nd %tdesc[0, 0]
+ : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ -> vector<4x16xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [0, 1]
+ : vector<4x16xf32> to f32
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} : vector<1xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} [1] : vector<1x4x16xf32> to vector<1x16xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1] : vector<1x16xf32> to vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+ -> !xegpu.tensor_desc<4x16xf32>
+ %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+ %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32>
+ %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1,1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1, 2]
+ : vector<1x4x16xf32> to vector<1xf32>
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x64xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x64xf32> -> !xegpu.tensor_desc<1x64xf32>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
+// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
+// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} : vector<1xf32> to vector<1x64xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} [1] : vector<1x1x64xf32> to vector<1x64xf32>
+// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1] : vector<1x64xf32> to vector<1xf32>
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x64xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x64xf32>
+ -> !xegpu.tensor_desc<1x64xf32>
+ %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
+ %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
+ %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2]
+ : vector<1x1x64xf32> to vector<1xf32>
+ gpu.return
+ }
+}
>From 86579d59039b048a0d38e8ca4f33128c139760aa Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Thu, 11 Dec 2025 09:06:39 +0000
Subject: [PATCH 04/10] Add layout handling, disallow leading unit dims/slices
for now
---
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 64 +++++++++++-----
.../Transforms/XeGPUSubgroupDistribute.cpp | 15 ++--
.../Dialect/XeGPU/optimize-2d-reduction.mlir | 73 ++++---------------
.../Dialect/XeGPU/subgroup-distribute.mlir | 47 +++++++++++-
4 files changed, 113 insertions(+), 86 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index b17b204a07dbc..5b27b008625e2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -424,40 +424,50 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
if (reductionOp.getReductionDims().size() != 2)
return rewriter.notifyMatchFailure(reductionOp,
"Expected 2D multi reduction");
+ // Retrieve layouts.
+ auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
+ auto srcLayout = xegpu::getDistributeLayoutAttr(reductionOp.getSource());
+ assert(isa<xegpu::LayoutAttr>(srcLayout) &&
+ "Currently we do not support sliced inputs");
- auto layout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
-
+ // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
auto dims = llvm::to_vector(reductionOp.getReductionDims());
- auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, layout);
+ auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
// Order does not matter
if (intraLaneDim == -1 || crossLaneDim == -1) {
intraLaneDim = dims[0];
crossLaneDim = dims[1];
}
auto loc = reductionOp.getLoc();
- // XeGPU transforms expect vector types
auto sourceVecType = reductionOp.getSourceVectorType();
auto acc = reductionOp.getAcc();
+ // If the accumulator is scalar, convert to 1-element vector and assign the
+ // result layout
bool scalarAcc = !isa<VectorType>(acc.getType());
- if (scalarAcc)
+ // TODO: remove scalar acc assumption (need more complex layout adjustments
+ // for sliced inputs).
+ assert(scalarAcc && "Expected scalar acc");
+ if (scalarAcc) {
acc = vector::FromElementsOp::create(
rewriter, loc, VectorType::get({1}, sourceVecType.getElementType()),
acc);
+ xegpu::setDistributeLayoutAttr(
+ llvm::dyn_cast<OpResult>(acc),
+ cast<xegpu::DistributeLayoutAttr>(resLayout));
+ }
- // Preserve layout in the intermediate reduction (apart from the reduced
- // dim)
- auto sourceSliceLayoutAttr = cast<xegpu::SliceAttr>(layout);
- SmallVector<int64_t> sliceDims{
- sourceSliceLayoutAttr.getDims().asArrayRef()};
+ // The first reduction's dist attribute does not have the cross lane dim.
+ auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
+ SmallVector<int64_t> sliceDims{resSliceLayoutAttr.getDims().asArrayRef()};
auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), crossLaneDim);
assert(foundIt != sliceDims.end() &&
"Expected to find reduction dim in slice dims");
sliceDims.erase(foundIt);
- auto intraLaneLayout = xegpu::SliceAttr::get(
- reductionOp.getContext(), sourceSliceLayoutAttr.getParent(),
+ auto intraLaneRedResLayout = xegpu::SliceAttr::get(
+ reductionOp.getContext(), resSliceLayoutAttr.getParent(),
DenseI64ArrayAttr::get(getContext(), sliceDims));
- // First we do intra-lane reduction
+ // We reduce only one dim first, adjust accumulator.
SmallVector<int64_t> accShape(sourceVecType.getShape());
accShape.erase(accShape.begin() + intraLaneDim);
// Add a dim to the lower-dim user-supplied acc
@@ -468,34 +478,50 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
VectorType::get(accShape, sourceVecType.getElementType()), acc);
xegpu::setDistributeLayoutAttr(
llvm::dyn_cast<OpResult>(firstRedAcc),
- cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+ cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
}
Value intraLaneReduced = vector::MultiDimReductionOp::create(
rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
firstRedAcc, ArrayRef<int64_t>(intraLaneDim));
xegpu::setDistributeLayoutAttr(
llvm::dyn_cast<OpResult>(intraLaneReduced),
- cast<xegpu::DistributeLayoutAttr>(intraLaneLayout));
+ cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
- // For scalar results, add a unit dim where intra lane dim was
+ xegpu::DistributeLayoutAttr nextMultiRedLayout = intraLaneRedResLayout;
+ // Example: vector<2x4> got reduced to vector<2>, next reduction returns a
+ // scalar, distribution passes do not support this result type. Expand to
+ // vector<2x1>, so that the second reduction result is vector<1>. Restore
+ // this dim in layout, but lane data is 1.
if (scalarAcc) {
+ SmallVector<int> srcLaneData(srcLayout.getRank(), 1);
+ auto laneLayoutSrc = srcLayout.getEffectiveLaneLayoutAsInt();
+ SmallVector<int> srcLaneLayout(laneLayoutSrc.begin(),
+ laneLayoutSrc.end());
+ nextMultiRedLayout = xegpu::LayoutAttr::get(
+ reductionOp.getContext(),
+ DenseI32ArrayAttr::get(reductionOp.getContext(), srcLaneLayout),
+ DenseI32ArrayAttr::get(reductionOp.getContext(), srcLaneData));
+
SmallVector<int64_t> vecTypeWithUnitDim{sourceVecType.getShape()};
vecTypeWithUnitDim[intraLaneDim] = 1;
intraLaneReduced = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get(vecTypeWithUnitDim, sourceVecType.getElementType()),
intraLaneReduced);
- // Layout matches last reduction
xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(intraLaneReduced),
- layout);
+ nextMultiRedLayout);
} else
crossLaneDim -= static_cast<int64_t>(intraLaneDim < crossLaneDim);
// Do cross-lane reduction
+ // TODO: why use accumulator again?
Value crossLaneReduced = vector::MultiDimReductionOp::create(
rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
ArrayRef<int64_t>(crossLaneDim));
+ auto crossLaneLayout = xegpu::SliceAttr::get(
+ reductionOp.getContext(), nextMultiRedLayout,
+ DenseI64ArrayAttr::get(getContext(), crossLaneDim));
xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(crossLaneReduced),
- layout);
+ crossLaneLayout);
if (scalarAcc)
crossLaneReduced =
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a1c0656d0bdb5..9113f00ac39f0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -875,12 +875,15 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
std::string layoutMaskName =
xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3));
- xegpu::LayoutAttr layoutPayload =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
- xegpu::LayoutAttr layoutOffsets =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
- xegpu::LayoutAttr layoutMask =
- storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
+ xegpu::DistributeLayoutAttr layoutPayload =
+ storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+ layoutPayloadName);
+ xegpu::DistributeLayoutAttr layoutOffsets =
+ storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+ layoutOffsetsName);
+ xegpu::DistributeLayoutAttr layoutMask =
+ storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+ layoutMaskName);
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
index 754825193a10f..6d69b351bb5d9 100644
--- a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
+++ b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
@@ -2,84 +2,39 @@
// RUN: --xegpu-optimize-block-loads --split-input-file %s | FileCheck %s
// CHECK-LABEL: gpu.func @vector_reduce_2d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) {
// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
-// CHECK: %[[ACC_VEC:.*]] = vector.from_elements %[[ACC]] : vector<1xf32>
+// CHECK: %[[ACC_VEC:.*]] = vector.from_elements %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<1xf32>
// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC_VEC]]
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC_FOR_INTRA]]
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.shape_cast %[[LOADED_REDUCED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<16xf32> to vector<1x16xf32>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED_FOR_CROSS]], %[[ACC_VEC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [1] : vector<1x16xf32> to vector<1xf32>
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
// CHECK: %[[SCALAR_RES:.*]] = vector.extract %[[LOADED_REDUCED_2D]][0] : f32 from vector<1xf32>
gpu.module @xevm_test {
- gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.0 : f32
%tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
-> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
%load = xegpu.load_nd %tdesc[0, 0]
: !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
-> vector<4x16xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} [0, 1]
- : vector<4x16xf32> to f32
- gpu.return
- }
-}
+ %reduce = vector.multi_reduction <add>, %load, %cst
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>}
+ [0, 1] : vector<4x16xf32> to f32
+ %reduce_bcast = vector.broadcast %reduce
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}
+ : f32 to vector<16xf32>
-// -----
-// CHECK-LABEL: gpu.func @vector_reduce_2d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>) {
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
-// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
-// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32
-// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} : vector<1xf32> to vector<1x16xf32>
-// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1]>} [1] : vector<1x4x16xf32> to vector<1x16xf32>
-// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1] : vector<1x16xf32> to vector<1xf32>
-gpu.module @xevm_test {
- gpu.func @vector_reduce_2d(%src: memref<4x16xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
- %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
- -> !xegpu.tensor_desc<4x16xf32>
- %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>} : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
- %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 4, 1]>} : vector<4x16xf32> to vector<1x4x16xf32>
- %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1,1, 16], lane_data = [1, 4, 1]>, dims = [1, 2]>} [1, 2]
- : vector<1x4x16xf32> to vector<1xf32>
- gpu.return
- }
-}
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
-// -----
-// CHECK-LABEL: gpu.func @vector_reduce_2d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x64xf32>) {
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.000000e+00> : vector<1xf32>
-// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x64xf32> -> !xegpu.tensor_desc<1x64xf32>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
-// CHECK: %[[LOADED_LEADING_UNIT:.*]] = vector.shape_cast %[[LOADED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
-// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} : vector<1xf32> to vector<1x64xf32>
-// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED_LEADING_UNIT]], %[[ACC_VEC_FOR_INTRA]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1]>} [1] : vector<1x1x64xf32> to vector<1x64xf32>
-// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED]], %[[ACC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1] : vector<1x64xf32> to vector<1xf32>
-gpu.module @xevm_test {
- gpu.func @vector_reduce_2d(%src: memref<4x64xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [0, 1]>} dense<1.0> : vector<1xf32>
- %tdesc = xegpu.create_nd_tdesc %src : memref<4x64xf32>
- -> !xegpu.tensor_desc<1x64xf32>
- %load = xegpu.load_nd %tdesc[0, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 4]>} : !xegpu.tensor_desc<1x64xf32> -> vector<1x64xf32>
- %load_with_dim = vector.shape_cast %load {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>} : vector<1x64xf32> to vector<1x1x64xf32>
- %reduce = vector.multi_reduction <add>, %load_with_dim, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2]
- : vector<1x1x64xf32> to vector<1xf32>
+ xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 87c67ba6bf324..68c43c9a7a48a 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -369,7 +369,7 @@ gpu.module @xevm_module{
%2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
// CHECK-NOT: vector.broadcast
// CHECK-NOT: vector.shape_cast
-
+
%tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
@@ -397,4 +397,47 @@ gpu.module @xevm_module{
}
}
-
+// -----
+gpu.module @xevm_test {
+ // CHECK-LABEL: gpu.func @vector_reduce_2d
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
+ // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
+ // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
+ // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
+ // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
+ // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
+ // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
+ // CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32
+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+ // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] <{packed}> : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
+ // CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction <add>, %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32
+ // CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32
+ // CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32
+ // CHECK: %[[SHUFFLE_1:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_0]], %[[C2]], %[[C16]] : f32
+ // CHECK: %[[VEC_RED_1:.*]] = arith.addf %[[VEC_RED_0]], %[[SHUFFLE_1]] : f32
+ // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_1]], %[[C4]], %[[C16]] : f32
+ // CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32
+ // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
+ // CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
+ // CHECK: %[[VEC_RED_4:.*]] = arith.addf %[[VEC_RED_3]], %[[CST_1]] : f32
+ // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_4]] : f32 to vector<1xf32>
+ // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]]
+ // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}> : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
+ gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
+ %2 = vector.from_elements %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<1xf32>
+ %3 = vector.broadcast %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
+ %4 = vector.multi_reduction <add>, %1, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+ %5 = vector.shape_cast %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
+ %6 = vector.multi_reduction <add>, %5, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
+ %7 = vector.extract %6[0] : f32 from vector<1xf32>
+ %8 = vector.broadcast %7 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : f32 to vector<16xf32>
+ %cst_0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
+ %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+ xegpu.store %8, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+ gpu.return
+ }
+}
>From 5b444f5cf7fc2971ebc442c2f4ef3764828d6719 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Mon, 15 Dec 2025 17:00:44 +0000
Subject: [PATCH 05/10] Simplify 2d optimization
---
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 54 +++----------------
.../Dialect/XeGPU/optimize-2d-reduction.mlir | 13 ++---
2 files changed, 10 insertions(+), 57 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index 5b27b008625e2..1f3ee0540405a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -447,14 +447,6 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
// TODO: remove scalar acc assumption (need more complex layout adjustments
// for sliced inputs).
assert(scalarAcc && "Expected scalar acc");
- if (scalarAcc) {
- acc = vector::FromElementsOp::create(
- rewriter, loc, VectorType::get({1}, sourceVecType.getElementType()),
- acc);
- xegpu::setDistributeLayoutAttr(
- llvm::dyn_cast<OpResult>(acc),
- cast<xegpu::DistributeLayoutAttr>(resLayout));
- }
// The first reduction's dist attribute does not have the cross lane dim.
auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
@@ -467,7 +459,7 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
reductionOp.getContext(), resSliceLayoutAttr.getParent(),
DenseI64ArrayAttr::get(getContext(), sliceDims));
- // We reduce only one dim first, adjsut accumulator.
+ // We reduce intra-lane, acc is source without intra lane.
SmallVector<int64_t> accShape(sourceVecType.getShape());
accShape.erase(accShape.begin() + intraLaneDim);
// Add a dim to the lower-dim user-supplied acc
@@ -487,45 +479,11 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
llvm::dyn_cast<OpResult>(intraLaneReduced),
cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
- xegpu::DistributeLayoutAttr nextMultiRedLayout = intraLaneRedResLayout;
- // Example: vector<2x4> got reduced to vector<2>, next reduction returns a
- // scalar, distribution passes do not support this result type. Expand to
- // vector<2x1>, so that the second reduction result is vector<1>. Restore
- // this dim in layout, but lane data is 1.
- if (scalarAcc) {
- SmallVector<int> srcLaneData(srcLayout.getRank(), 1);
- auto laneLayoutSrc = srcLayout.getEffectiveLaneLayoutAsInt();
- SmallVector<int> srcLaneLayout(laneLayoutSrc.begin(),
- laneLayoutSrc.end());
- nextMultiRedLayout = xegpu::LayoutAttr::get(
- reductionOp.getContext(),
- DenseI32ArrayAttr::get(reductionOp.getContext(), srcLaneLayout),
- DenseI32ArrayAttr::get(reductionOp.getContext(), srcLaneData));
-
- SmallVector<int64_t> vecTypeWithUnitDim{sourceVecType.getShape()};
- vecTypeWithUnitDim[intraLaneDim] = 1;
- intraLaneReduced = vector::ShapeCastOp::create(
- rewriter, loc,
- VectorType::get(vecTypeWithUnitDim, sourceVecType.getElementType()),
- intraLaneReduced);
- xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(intraLaneReduced),
- nextMultiRedLayout);
- } else
- crossLaneDim -= static_cast<int64_t>(intraLaneDim < crossLaneDim);
- // Do cross-lane reduction
- // TODO: why use accumulator again?
- Value crossLaneReduced = vector::MultiDimReductionOp::create(
- rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
- ArrayRef<int64_t>(crossLaneDim));
- auto crossLaneLayout = xegpu::SliceAttr::get(
- reductionOp.getContext(), nextMultiRedLayout,
- DenseI64ArrayAttr::get(getContext(), crossLaneDim));
- xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(crossLaneReduced),
- crossLaneLayout);
-
- if (scalarAcc)
- crossLaneReduced =
- vector::ExtractOp::create(rewriter, loc, crossLaneReduced, 0);
+ Value crossLaneReduced = vector::ReductionOp::create(
+ rewriter, loc, reductionOp.getKind(), intraLaneReduced, nullptr);
+ xegpu::setDistributeLayoutAttr(
+ llvm::dyn_cast<OpResult>(crossLaneReduced),
+ cast<xegpu::DistributeLayoutAttr>(resLayout));
assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
"Type mismatch");
rewriter.replaceOp(reductionOp, crossLaneReduced);
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
index 6d69b351bb5d9..dcd9ff1154dcc 100644
--- a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
+++ b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
@@ -6,16 +6,11 @@
// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
-// CHECK: %[[ACC_VEC:.*]] = vector.from_elements %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<1xf32>
-// CHECK: %[[ACC_VEC_FOR_INTRA:.*]] = vector.broadcast %[[ACC_VEC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
-// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC_FOR_INTRA]]
+// CHECK: %[[ACC_VEC:.*]] = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : f32 to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC]]
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
-// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.shape_cast %[[LOADED_REDUCED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
-// CHECK: %[[LOADED_REDUCED_2D:.*]] = vector.multi_reduction <add>, %[[LOADED_REDUCED_FOR_CROSS]], %[[ACC_VEC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
-// CHECK: %[[SCALAR_RES:.*]] = vector.extract %[[LOADED_REDUCED_2D]][0] : f32 from vector<1xf32>
+// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction <add>, %[[LOADED_REDUCED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
gpu.module @xevm_test {
gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.0 : f32
>From 329083f4b67294393020cc7bcc34ba7c9a2233e4 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 16 Dec 2025 10:25:41 +0000
Subject: [PATCH 06/10] Add peephole to master pass
---
mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 38313dc3c01d5..269acc4382eb4 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -77,6 +77,7 @@ void buildGPUPassPipeline(OpPassManager &pm,
layoutOptions.layoutKind = "lane";
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(layoutOptions));
+ pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUOptimizeBlockLoads());
pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
>From cd246033b9bfa52b1f17e7b76e0f01dc6b8bdd23 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 16 Dec 2025 14:46:43 +0000
Subject: [PATCH 07/10] Cleanup
---
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 33 ++++++-------------
.../Dialect/XeGPU/optimize-2d-reduction.mlir | 24 +++++++-------
.../Dialect/XeGPU/subgroup-distribute.mlir | 30 ++++++++---------
3 files changed, 35 insertions(+), 52 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index 1f3ee0540405a..02e91f48b1ff2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -421,15 +421,12 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- if (reductionOp.getReductionDims().size() != 2)
- return rewriter.notifyMatchFailure(reductionOp,
- "Expected 2D multi reduction");
- // Retrieve layouts.
+ auto sourceVecType = reductionOp.getSourceVectorType();
+ if (reductionOp.getReductionDims().size() != 2 ||
+ sourceVecType.getRank() != 2)
+ return rewriter.notifyMatchFailure(
+ reductionOp, "Expected 2D multi reduction of a 2D source");
auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
- auto srcLayout = xegpu::getDistributeLayoutAttr(reductionOp.getSource());
- assert(isa<xegpu::LayoutAttr>(srcLayout) &&
- "Currently we do not support sliced inputs");
-
// Retrieve and order dims for 1D decomposition (prefer intra-lane first).
auto dims = llvm::to_vector(reductionOp.getReductionDims());
auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
@@ -439,14 +436,7 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
crossLaneDim = dims[1];
}
auto loc = reductionOp.getLoc();
- auto sourceVecType = reductionOp.getSourceVectorType();
auto acc = reductionOp.getAcc();
- // If the accumulator is scalar, convert to 1-element vector and assign the
- // result layout
- bool scalarAcc = !isa<VectorType>(acc.getType());
- // TODO: remove scalar acc assumption (need more complex layout adjustments
- // for sliced inputs).
- assert(scalarAcc && "Expected scalar acc");
// The first reduction's dist attribute does not have the cross lane dim.
auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
@@ -459,22 +449,19 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
reductionOp.getContext(), resSliceLayoutAttr.getParent(),
DenseI64ArrayAttr::get(getContext(), sliceDims));
- // We reduce intra-lane, acc is source without intra lane.
SmallVector<int64_t> accShape(sourceVecType.getShape());
accShape.erase(accShape.begin() + intraLaneDim);
- // Add a dim to the lower-dim user-supplied acc
- Value firstRedAcc = acc;
- if (firstRedAcc) {
- firstRedAcc = vector::BroadcastOp::create(
+ if (acc) {
+ acc = vector::BroadcastOp::create(
rewriter, loc,
VectorType::get(accShape, sourceVecType.getElementType()), acc);
xegpu::setDistributeLayoutAttr(
- llvm::dyn_cast<OpResult>(firstRedAcc),
+ llvm::dyn_cast<OpResult>(acc),
cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
}
Value intraLaneReduced = vector::MultiDimReductionOp::create(
- rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
- firstRedAcc, ArrayRef<int64_t>(intraLaneDim));
+ rewriter, loc, reductionOp.getKind(), reductionOp.getSource(), acc,
+ ArrayRef<int64_t>(intraLaneDim));
xegpu::setDistributeLayoutAttr(
llvm::dyn_cast<OpResult>(intraLaneReduced),
cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
index dcd9ff1154dcc..d3a1c33ced0e9 100644
--- a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
+++ b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
@@ -3,33 +3,33 @@
// CHECK-LABEL: gpu.func @vector_reduce_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
-// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
-// CHECK: %[[ACC_VEC:.*]] = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : f32 to vector<16xf32>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+// CHECK: %[[ACC_VEC:.*]] = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction <add>, %[[LOADED_REDUCED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
gpu.module @xevm_test {
gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.0 : f32
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.0 : f32
%tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
- -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%load = xegpu.load_nd %tdesc[0, 0]
- : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
+ : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<4x16xf32>
%reduce = vector.multi_reduction <add>, %load, %cst
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>}
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}
[0, 1] : vector<4x16xf32> to f32
%reduce_bcast = vector.broadcast %reduce
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
: f32 to vector<16xf32>
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
- xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+ xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 68c43c9a7a48a..b9a60c021e29b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -400,8 +400,6 @@ gpu.module @xevm_module{
// -----
gpu.module @xevm_test {
// CHECK-LABEL: gpu.func @vector_reduce_2d
- // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
- // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
@@ -409,8 +407,10 @@ gpu.module @xevm_test {
// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
// CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
- // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] <{packed}> : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
+ // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
// CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction <add>, %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32
// CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32
// CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32
@@ -420,24 +420,20 @@ gpu.module @xevm_test {
// CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32
// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
// CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
- // CHECK: %[[VEC_RED_4:.*]] = arith.addf %[[VEC_RED_3]], %[[CST_1]] : f32
- // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_4]] : f32 to vector<1xf32>
+ // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32>
// CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]]
- // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}> : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
+ // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
- %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>>
- %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>> -> vector<4x16xf32>
- %2 = vector.from_elements %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0, 1]>} : vector<1xf32>
- %3 = vector.broadcast %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : vector<1xf32> to vector<16xf32>
- %4 = vector.multi_reduction <add>, %1, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
- %5 = vector.shape_cast %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
- %6 = vector.multi_reduction <add>, %5, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
- %7 = vector.extract %6[0] : f32 from vector<1xf32>
- %8 = vector.broadcast %7 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>} : f32 to vector<16xf32>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+ %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
+ %3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+ %4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+ %5 = vector.broadcast %4 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
%cst_0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
%cst_1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
- xegpu.store %8, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+ xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
gpu.return
}
}
>From 4937e4f6288a00448bb59a93c0182c8371e7a520 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Wed, 17 Dec 2025 16:24:53 +0000
Subject: [PATCH 08/10] Add propagation
---
mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 269acc4382eb4..54c7b05eb4f7a 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -78,6 +78,8 @@ void buildGPUPassPipeline(OpPassManager &pm,
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(layoutOptions));
pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUOptimizeBlockLoads());
+ pm.addNestedPass<gpu::GPUModuleOp>(
+ xegpu::createXeGPUPropagateLayout(layoutOptions));
pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
>From 04109c5fd2c06bc304b53cc661d69a17655c584a Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Thu, 18 Dec 2025 18:03:04 +0000
Subject: [PATCH 09/10] Feedback
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 15 ++++++-----
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 19 ++++++++++++++
.../Transforms/XeGPUOptimizeBlockLoads.cpp | 25 +++++++------------
.../Dialect/XeGPU/subgroup-distribute.mlir | 9 +++----
4 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 446f64fffa468..58f8152878093 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -520,9 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
/// Check if this is slice of some other layout.
bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
-
+
/// Check if this is identical to some other layout.
- bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
+ bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
}];
@@ -698,9 +698,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
/// Check if this is slice of some other layout.
bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-
+
+ /// Drop the slice dims to get the original layout.
+ SliceAttr dropSliceDims(ArrayRef<int64_t> dimsToDrop);
+
/// Check if this is identical to some other layout.
- bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
+ bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
}];
let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
@@ -782,13 +785,13 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
let methods = [
InterfaceMethod<
/*desc=*/"Get the anchor layout attribute.",
- /*retTy=*/"xegpu::DistributeLayoutAttr",
+ /*retTy=*/"xegpu::DistributeLayoutAttr",
/*methodName=*/"getAnchorLayout",
/*args=*/(ins)
>,
InterfaceMethod<
/*desc=*/"Set the anchor layout attribute.",
- /*retTy=*/"void",
+ /*retTy=*/"void",
/*methodName=*/"setAnchorLayout",
/*args=*/(ins "xegpu::DistributeLayoutAttr":$layout)
>,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ccf17da26c942..1a780dcedc2f5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -592,6 +592,25 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
[&](int64_t dim) { return thisDims.contains(dim); });
}
+xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> dimsToDrop) {
+ auto flattenedThis = flatten();
+ if (dimsToDrop.empty())
+ return flattenedThis;
+ SmallVector<int64_t> sliceDims{flattenedThis.getDims().asArrayRef()};
+ for (auto dim : dimsToDrop) {
+ auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), dim);
+ assert(foundIt != sliceDims.end() &&
+ "Expected to find the specified reduction dim in slice dims");
+ sliceDims.erase(foundIt);
+ }
+
+ auto sliceWithoutDims = xegpu::SliceAttr::get(
+ this->getContext(), flattenedThis.getParent(),
+ DenseI64ArrayAttr::get(this->getContext(), sliceDims));
+
+ return sliceWithoutDims;
+}
+
bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
if (dyn_cast<xegpu::LayoutAttr>(other))
return false;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index 02e91f48b1ff2..31fa282472ffc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -416,7 +416,8 @@ class VectorExtractOpPattern final
}
};
-class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
+class MultiRed2dOpPattern
+ : public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
@@ -440,14 +441,8 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
// The first reduction's dist attribute does not have the cross lane dim.
auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
- SmallVector<int64_t> sliceDims{resSliceLayoutAttr.getDims().asArrayRef()};
- auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), crossLaneDim);
- assert(foundIt != sliceDims.end() &&
- "Expected to find reduction dim in slice dims");
- sliceDims.erase(foundIt);
- auto intraLaneRedResLayout = xegpu::SliceAttr::get(
- reductionOp.getContext(), resSliceLayoutAttr.getParent(),
- DenseI64ArrayAttr::get(getContext(), sliceDims));
+ SmallVector<int64_t> dropDims{crossLaneDim};
+ auto intraLaneRedResLayout = resSliceLayoutAttr.dropSliceDims(dropDims);
SmallVector<int64_t> accShape(sourceVecType.getShape());
accShape.erase(accShape.begin() + intraLaneDim);
@@ -485,12 +480,9 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
assert(reductionDims.size() == 2 && "Expected 2D reduction");
int64_t intra, cross = -1;
xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
- if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout)) {
- while (dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent()))
- layoutSliceAttr =
- dyn_cast<xegpu::SliceAttr>(layoutSliceAttr.getParent());
- layoutAttr = dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.getParent());
- }
+ if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
+ layoutAttr =
+ dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
assert(layoutAttr);
SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
@@ -511,7 +503,8 @@ class MultiRed2dOp : public OpConversionPattern<vector::MultiDimReductionOp> {
void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
RewritePatternSet &patterns) {
patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
- VectorExtractOpPattern, MultiRed2dOp>(patterns.getContext());
+ VectorExtractOpPattern, MultiRed2dOpPattern>(
+ patterns.getContext());
}
namespace {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index b9a60c021e29b..dae00838fdcb6 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -208,7 +208,7 @@ gpu.module @xevm_module{
: vector<16x8xi32> to vector<16x16xf16>
%5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: vector<16x16xf16> to vector<16x16xf16>
- %6 = xegpu.dpas %1, %5
+ %6 = xegpu.dpas %1, %5
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -364,7 +364,7 @@ gpu.module @xevm_module{
%c0 = arith.constant 0 : index
%mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
%1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
-
+
%11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
%2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
// CHECK-NOT: vector.broadcast
@@ -421,12 +421,11 @@ gpu.module @xevm_test {
// CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
// CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
// CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32>
- // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]]
- // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
+ // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
%0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+ %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
%2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
%3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
%4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
>From af6d3ae95069af2fc66e79a469ff35a43d37f89b Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Wed, 7 Jan 2026 13:53:36 +0000
Subject: [PATCH 10/10] Remove flattening, rename args, pass, consolidate tests
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
.../mlir/Dialect/XeGPU/Transforms/Passes.td | 2 +-
.../Dialect/XeGPU/Transforms/Transforms.h | 2 +-
.../GPU/Pipelines/GPUToXeVMPipeline.cpp | 2 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 13 ++++---
.../Dialect/XeGPU/Transforms/CMakeLists.txt | 2 +-
...ckLoads.cpp => XeGPUPeepHoleOptimizer.cpp} | 18 +++++-----
.../Dialect/XeGPU/optimize-2d-reduction.mlir | 35 ------------------
...-transpose.mlir => peephole-optimize.mlir} | 36 ++++++++++++++++++-
9 files changed, 55 insertions(+), 57 deletions(-)
rename mlir/lib/Dialect/XeGPU/Transforms/{XeGPUOptimizeBlockLoads.cpp => XeGPUPeepHoleOptimizer.cpp} (98%)
delete mode 100644 mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
rename mlir/test/Dialect/XeGPU/{optimize-transpose.mlir => peephole-optimize.mlir} (88%)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 58f8152878093..5e56854c254a0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -700,7 +700,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
/// Drop the slice dims to get the original layout.
- SliceAttr dropSliceDims(ArrayRef<int64_t> dimsToDrop);
+ SliceAttr dropSliceDims(ArrayRef<int64_t> sliceDimsToDrop);
/// Check if this is identical to some other layout.
bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3ff7805263f0e..e25adbd1673d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -102,7 +102,7 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> {
"scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"];
}
-def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
+def XeGPUPeepHoleOptimizer : Pass<"xegpu-optimize-peephole"> {
let summary = "Optimize XeGPU block load operations";
let description = [{
This pass rewrites XeGPU loadNd operations into more optimal forms
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1776a209d0bf1..5942f69b4a66d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -62,7 +62,7 @@ struct UnrollOptions {
/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
/// Appends patterns for optimizing block load operations into `patterns`.
-void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns);
+void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU SIMT distribution into `patterns`.
void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
/// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 54c7b05eb4f7a..f7fff8e1fd4cf 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -77,7 +77,7 @@ void buildGPUPassPipeline(OpPassManager &pm,
layoutOptions.layoutKind = "lane";
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(layoutOptions));
- pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUOptimizeBlockLoads());
+ pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPeepHoleOptimizer());
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(layoutOptions));
pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 1a780dcedc2f5..53ca17f4f99bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -592,12 +592,11 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
[&](int64_t dim) { return thisDims.contains(dim); });
}
-xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> dimsToDrop) {
- auto flattenedThis = flatten();
- if (dimsToDrop.empty())
- return flattenedThis;
- SmallVector<int64_t> sliceDims{flattenedThis.getDims().asArrayRef()};
- for (auto dim : dimsToDrop) {
+xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> sliceDimsToDrop) {
+ if (sliceDimsToDrop.empty())
+ return *this;
+ SmallVector<int64_t> sliceDims{getDims().asArrayRef()};
+ for (auto dim : sliceDimsToDrop) {
auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), dim);
assert(foundIt != sliceDims.end() &&
"Expected to find the specified reduction dim in slice dims");
@@ -605,7 +604,7 @@ xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> dimsToDrop) {
}
auto sliceWithoutDims = xegpu::SliceAttr::get(
- this->getContext(), flattenedThis.getParent(),
+ this->getContext(), getParent(),
DenseI64ArrayAttr::get(this->getContext(), sliceDims));
return sliceWithoutDims;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 29b645feab2c6..15d31eadcb6df 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUWgToSgDistribute.cpp
XeGPUPropagateLayout.cpp
XeGPUVectorLinearize.cpp
- XeGPUOptimizeBlockLoads.cpp
+ XeGPUPeepHoleOptimizer.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
similarity index 98%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 31fa282472ffc..f16572d9cae8b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -1,4 +1,4 @@
-//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
+//===- XeGPUPeepHoleOptimizer.cpp - XeGPU peephole optimizer ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,12 +30,12 @@
namespace mlir {
namespace xegpu {
-#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
+#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
-#define DEBUG_TYPE "xegpu-optimize-block-loads"
+#define DEBUG_TYPE "xegpu-optimize-peephole"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
using namespace mlir;
@@ -500,7 +500,7 @@ class MultiRed2dOpPattern
} // namespace
-void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
+void xegpu::populateXeGPUPeepHoleOptimizerPatterns(
RewritePatternSet &patterns) {
patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
VectorExtractOpPattern, MultiRed2dOpPattern>(
@@ -509,9 +509,9 @@ void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
namespace {
-struct XeGPUOptimizeBlockLoadsPass final
- : public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
- XeGPUOptimizeBlockLoadsPass> {
+struct XeGPUPeepHoleOptimizerPass final
+ : public xegpu::impl::XeGPUPeepHoleOptimizerBase<
+ XeGPUPeepHoleOptimizerPass> {
void runOnOperation() override {
MLIRContext &context = getContext();
TypeConverter converter;
@@ -528,7 +528,7 @@ struct XeGPUOptimizeBlockLoadsPass final
});
if (!isTargetSupported) {
- DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
+ DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
<< "\n";
return;
}
@@ -573,7 +573,7 @@ struct XeGPUOptimizeBlockLoadsPass final
vector::VectorDialect>();
scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
target);
- xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns);
+ xegpu::populateXeGPUPeepHoleOptimizerPatterns(patterns);
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns)))) {
DBGS() << "Optimize block loads pass failed.\n";
diff --git a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir b/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
deleted file mode 100644
index d3a1c33ced0e9..0000000000000
--- a/mlir/test/Dialect/XeGPU/optimize-2d-reduction.mlir
+++ /dev/null
@@ -1,35 +0,0 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
-// RUN: --xegpu-optimize-block-loads --split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: gpu.func @vector_reduce_2d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
-// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
-// CHECK: %[[ACC_VEC:.*]] = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
-// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
-// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction <add>, %[[LOADED_REDUCED]]
-// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
-gpu.module @xevm_test {
- gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.0 : f32
- %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
- -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
- : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<4x16xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}
- [0, 1] : vector<4x16xf32> to f32
- %reduce_bcast = vector.broadcast %reduce
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
- : f32 to vector<16xf32>
-
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
-
- xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
similarity index 88%
rename from mlir/test/Dialect/XeGPU/optimize-transpose.mlir
rename to mlir/test/Dialect/XeGPU/peephole-optimize.mlir
index c748c1ca5ef88..0d749ae60df57 100644
--- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
+++ b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
-// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s
+// RUN: --xegpu-optimize-peephole --canonicalize --cse --split-input-file %s | FileCheck %s
// CHECK-LABEL: gpu.func @no_scf(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> {
@@ -278,3 +278,37 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+// CHECK: %[[ACC_VEC:.*]] = vector.broadcast %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction <add>, %[[LOADED_REDUCED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+gpu.module @xevm_test {
+ gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.0 : f32
+ %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+ -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc[0, 0]
+ : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> vector<4x16xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}
+ [0, 1] : vector<4x16xf32> to f32
+ %reduce_bcast = vector.broadcast %reduce
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+ : f32 to vector<16xf32>
+
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
+
+ xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+ gpu.return
+ }
+}
More information about the Mlir-commits
mailing list