[Mlir-commits] [mlir] [MLIR][XeGPU] Add blocking and subgroup to lane distribution support for ConvertLayout operation (PR #183837)
Jianhui Li
llvmlistbot at llvm.org
Sat Feb 28 07:43:44 PST 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/183837
>From 364f3ef11ebfc00c243b391accf013a83e91aaf9 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Feb 2026 21:22:35 +0000
Subject: [PATCH 1/3] add blocking and subgroup to lane distribution support
for ConvertLayout op
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 1 -
.../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 2 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 6 --
.../XeGPU/Transforms/XeGPUBlocking.cpp | 38 ++++--------
.../Transforms/XeGPUSubgroupDistribute.cpp | 25 +++++++-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 58 ++++++++++++++++---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 2 +-
.../XeGPU/subgroup-distribute-unit.mlir | 16 +++++
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 45 ++++++++++----
9 files changed, 139 insertions(+), 54 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6d21aa9295716..f8a7c3b0f76a3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1539,7 +1539,6 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
}];
- let hasFolder = 1;
let hasVerifier = 1;
let hasCanonicalizer = 1;
}
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 8c92b7e2b718b..87dca361e81e1 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -345,7 +345,7 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
LogicalResult
matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const final {
- if (!hasValidWidth(op))
+ if (getSubgroupSize(op) && !hasValidWidth(op))
return rewriter.notifyMatchFailure(
op, "shuffle width and subgroup size mismatch");
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3aba0f5070764..5dafee8b66b80 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -1113,12 +1113,6 @@ LogicalResult ConvertLayoutOp::verify() {
return mlir::success();
}
-OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
- if (getInputLayout() == getTargetLayout())
- return getSource();
- return {};
-}
-
struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 206f52a6c71cc..fb29756e85e5f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -77,30 +77,6 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
-// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
-// layout attributes. Since both producer and consumer operations handle data
-// partitioning based on their own inst_data, while maintaining original input
-// and output shape, ConvertLayoutOp does not need to manage inst_data.
-struct ConvertLayoutOpPattern
- : public OpRewritePattern<xegpu::ConvertLayoutOp> {
- using OpRewritePattern::OpRewritePattern;
- LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
- PatternRewriter &rewriter) const override {
- xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr();
- xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr();
- if (inputLayout.getEffectiveInstDataAsInt().empty() ||
- targetLayout.getEffectiveInstDataAsInt().empty())
- return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
-
- inputLayout = inputLayout.dropInstData();
- targetLayout = targetLayout.dropInstData();
- auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
- op.getLoc(), op.getType(), op.getSource(), inputLayout, targetLayout);
- rewriter.replaceOp(op, newOp);
- return success();
- }
-};
-
//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
@@ -177,6 +153,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
return getTileShape(loadGatherOp->getOpOperand(0));
}
+ if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
+ auto inputInstData =
+ convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt();
+ auto targetInstData =
+ convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt();
+ // Return the inst_data with the larger product (input wins ties).
+ if (computeProduct(inputInstData) >= computeProduct(targetInstData))
+ return inputInstData;
+ else
+ return targetInstData;
+ }
+
if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
return getTileShape(storeScatterOp.getOffsets()
? storeScatterOp->getOpOperand(0)
@@ -378,8 +366,6 @@ void XeGPUBlockingPass::runOnOperation() {
});
RewritePatternSet patterns(ctx);
- patterns.add<ConvertLayoutOpPattern>(ctx);
-
vector::UnrollVectorOptions vectorOptions;
vectorOptions.setNativeShapeFn(options.nativeShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f05036deabe41..4dcc25677fd63 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -2060,6 +2060,29 @@ struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
}
};
+struct ConvertLayoutDistribution
+ : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
+ auto inputLayout = op.getInputLayoutAttr();
+ auto targetLayout = op.getTargetLayoutAttr();
+
+ if (!inputLayout || !targetLayout)
+ return rewriter.notifyMatchFailure(op, "missing layout attributes");
+
+ if (!inputLayout.isCompatibleWith(targetLayout, xegpu::LayoutKind::Lane)) {
+ op.emitError()
+ << "incompatible convert_layout not supported: input_layout="
+ << inputLayout << ", target_layout=" << targetLayout;
+ return failure();
+ }
+ rewriter.replaceOp(op, op.getSource());
+ return success();
+ }
+};
+
} // namespace
namespace {
@@ -2077,7 +2100,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
GpuBarrierDistribution, VectorMultiReductionDistribution,
LoadDistribution, StoreDistribution, VectorTransposeDistribution,
VectorBitcastDistribution, LoadMatrixDistribution,
- StoreMatrixDistribution,
+ StoreMatrixDistribution, ConvertLayoutDistribution,
MemrefExtractAlignedPointerAsIndexDistribution>(
patterns.getContext(),
/*pattern benefit=*/PatternHierarchy::Regular);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 2b1bd4d73a576..cf15f373c46c7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -1032,15 +1032,59 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
}
};
+struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
+ using UnrollPattern<xegpu::ConvertLayoutOp>::UnrollPattern;
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType());
+ assert(valueTy && "the value type must be vector type!");
+
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
+ return failure();
+
+ xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr();
+ xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr();
+ if (!inputLayout || !targetLayout)
+ return rewriter.notifyMatchFailure(op, "missing layout attributes.");
+
+ if (inputLayout.getEffectiveInstDataAsInt().empty() ||
+ targetLayout.getEffectiveInstDataAsInt().empty())
+ return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
+
+ inputLayout = inputLayout.dropInstData();
+ targetLayout = targetLayout.dropInstData();
+
+ SmallVector<Type> convertedValTypes =
+ getUnrolledTypes(valueTy, *targetShape);
+ SmallVector<Value> convertedValues =
+ pack(op.getOperand(), convertedValTypes, *targetShape, loc, rewriter);
+
+ Value newSource = op.getSource();
+ SmallVector<Value> newOps;
+ if (inputLayout && targetLayout) {
+ for (auto [v, t] : llvm::zip(convertedValues, convertedValTypes)) {
+ auto newOp = xegpu::ConvertLayoutOp::create(rewriter, op.getLoc(), t, v,
+ inputLayout, targetLayout);
+ newOps.push_back(newOp);
+ }
+ newSource = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+ }
+ rewriter.replaceOp(op, newSource);
+ return success();
+ }
+};
+
} // namespace
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
- patterns
- .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
- UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
- UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp,
- UnrollLoadGatherOpWithOffset, UnrollStoreScatterOpWithOffsets>(
- patterns.getContext(), options);
+ patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
+ UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
+ UnrollPrefetchOp, UnrollUpdateOffsetOp, UnrollLoadMatrixOp,
+ UnrollStoreMatrixOp, UnrollLoadGatherOpWithOffset,
+ UnrollStoreScatterOpWithOffsets, UnrollConvertLayoutOp>(
+ patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 3271e73e0b571..833a075cdd114 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -195,7 +195,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
- return convertOp.getInputLayoutAttr();
+ return convertOp.getTargetLayoutAttr();
}
auto layout = anchorOp.getAnchorLayout();
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 31bb6704eece9..dde58ba31860d 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1189,4 +1189,20 @@ gpu.func
gpu.return
}
+ // CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible(
+ // CHECK: %[[R:.*]] = gpu.warp_execute_on_lane_0
+ // CHECK-NOT: xegpu.convert_layout
+ // CHECK: gpu.yield %{{.*}} : vector<16xf32>
+ gpu.func @convert_layout_removed_when_compatible(%laneid: index){
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %0 = "some_op"() : () -> vector<16xf32>
+ %1 = xegpu.convert_layout %0
+ <{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}>
+ : vector<16xf32>
+ gpu.yield %1 : vector<16xf32>
+ }
+ "some_user_op"(%r) : (vector<1xf32>) -> ()
+ gpu.return
+ }
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index e80a9144b9674..b0b910a267d86 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -592,8 +592,8 @@ gpu.module @test_kernel {
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
%a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
- %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
- %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %a1 = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
+ %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
@@ -602,19 +602,42 @@ gpu.module @test_kernel {
// -----
-#lb = #xegpu.layout<inst_data = [8, 32, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
-#b = #xegpu.layout<inst_data = [8, 16, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
+#in = #xegpu.slice<#xegpu.layout<inst_data = [1, 16]>, dims = [1]>
+#out = #xegpu.slice<#xegpu.layout<inst_data = [1, 16], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
+gpu.module @test_kernel {
+ // CHECK-LABEL: gpu.func @convert_layout_drop_inst_data_to_null
+ // CHECK-NOT: xegpu.convert_layout
+ gpu.func @convert_layout_drop_inst_data_to_null(%arg0: vector<2xf32>) -> vector<2xf32> {
+ %0 = xegpu.convert_layout %arg0 <{input_layout = #in, target_layout = #out}> : vector<2xf32>
+ gpu.return %0 : vector<2xf32>
+ }
+}
+
+// -----
+
+#lb = #xegpu.layout<inst_data = [4, 32, 2], lane_layout = [1, 16, 1], lane_data = [4, 1, 2]>
+#b = #xegpu.layout<inst_data = [4, 16, 2], lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>
gpu.module @test_kernel {
//CHECK: gpu.func @convert_layout([[arg0:%.+]]: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
//CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<8x32x2xf16>
- //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
- //CHECK: [[m1:%.+]] = math.exp [[e1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
- //CHECK: [[r1:%.+]] = vector.insert_strided_slice [[m1]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
- //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 16, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
- //CHECK: [[m2:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
- //CHECK: [[r2:%.+]] = vector.insert_strided_slice [[m2]], [[r1]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
- //CHECK: gpu.return [[r2]] : vector<8x32x2xf16>
+ //CHECK: [[e0:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [4, 32, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<4x32x2xf16>
+ //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [4, 0, 0], sizes = [4, 32, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<4x32x2xf16>
+ //CHECK: [[c0:%.+]] = xegpu.convert_layout [[e0]] <{input_layout = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 2]>, target_layout = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>}> : vector<4x32x2xf16>
+ //CHECK: [[c1:%.+]] = xegpu.convert_layout [[e1]] <{input_layout = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 2]>, target_layout = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>}> : vector<4x32x2xf16>
+ //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[c0]] {offsets = [0, 0, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16>
+ //CHECK: [[m0:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>} : vector<4x16x2xf16>
+ //CHECK: [[i0:%.+]] = vector.insert_strided_slice [[m0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: [[e3:%.+]] = vector.extract_strided_slice [[c0]] {offsets = [0, 16, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16>
+ //CHECK: [[m1:%.+]] = math.exp [[e3]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>} : vector<4x16x2xf16>
+ //CHECK: [[i1:%.+]] = vector.insert_strided_slice [[m1]], [[i0]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: [[e4:%.+]] = vector.extract_strided_slice [[c1]] {offsets = [0, 0, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16>
+ //CHECK: [[m2:%.+]] = math.exp [[e4]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>} : vector<4x16x2xf16>
+ //CHECK: [[i2:%.+]] = vector.insert_strided_slice [[m2]], [[i1]] {offsets = [4, 0, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: [[e5:%.+]] = vector.extract_strided_slice [[c1]] {offsets = [0, 16, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16>
+ //CHECK: [[m3:%.+]] = math.exp [[e5]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>} : vector<4x16x2xf16>
+ //CHECK: [[i3:%.+]] = vector.insert_strided_slice [[m3]], [[i2]] {offsets = [4, 16, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: gpu.return [[i3]] : vector<8x32x2xf16>
gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
%b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
>From ea708a9d3c6e2b081396a2b70944b761b609b6ca Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 28 Feb 2026 05:23:49 +0000
Subject: [PATCH 2/3] fix the needsUnroll function for ConvertLayout
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 12 +++++++++++-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 +-
mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 10 +++++-----
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 11 +++++++++++
4 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index fb29756e85e5f..70da453aedd64 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -248,7 +248,17 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
return tileShape.has_value() && isUnrollable(result, *tileShape);
});
- return hasUnrollableOperands || hasUnrollableResults;
+ // ConvertLayoutOp must be processed to drop the inst_data in the layout
+ bool isConvertLayoutWithInstData = false;
+ if (isa<xegpu::ConvertLayoutOp>(op)) {
+ xegpu::ConvertLayoutOp convertLayoutOp = cast<xegpu::ConvertLayoutOp>(op);
+ auto targettLayout = convertLayoutOp.getTargetLayout();
+ if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) {
+ isConvertLayoutWithInstData = true;
+ }
+ }
+ return hasUnrollableOperands || hasUnrollableResults ||
+ isConvertLayoutWithInstData;
}
void XeGPUBlockingPass::runOnOperation() {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 10cd65b080405..57f48cf0145f2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1116,7 +1116,7 @@ void LayoutInfoPropagation::visitLoadMatrixOp(
if (!hasParamsOfLayoutKind(anchorLayout)) {
VectorType resVecTy =
llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
- assert(resVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
+ assert(resVecTy.getRank() == 2 && "Expecting 2D vector for load matrix.");
const auto *uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
auto requiredAnchorLayoutAttr = xegpu::setupLoadMatrixAnchorLayout(
layoutKind, resVecTy, consumerLayoutAttr, uArch);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index cf15f373c46c7..0f6528eeeb3da 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -1056,14 +1056,13 @@ struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
inputLayout = inputLayout.dropInstData();
targetLayout = targetLayout.dropInstData();
- SmallVector<Type> convertedValTypes =
- getUnrolledTypes(valueTy, *targetShape);
- SmallVector<Value> convertedValues =
- pack(op.getOperand(), convertedValTypes, *targetShape, loc, rewriter);
-
Value newSource = op.getSource();
SmallVector<Value> newOps;
if (inputLayout && targetLayout) {
+ SmallVector<Type> convertedValTypes =
+ getUnrolledTypes(valueTy, *targetShape);
+ SmallVector<Value> convertedValues =
+ pack(op.getOperand(), convertedValTypes, *targetShape, loc, rewriter);
for (auto [v, t] : llvm::zip(convertedValues, convertedValTypes)) {
auto newOp = xegpu::ConvertLayoutOp::create(rewriter, op.getLoc(), t, v,
inputLayout, targetLayout);
@@ -1071,6 +1070,7 @@ struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
}
newSource = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
}
+
rewriter.replaceOp(op, newSource);
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index b0b910a267d86..af8615740fde0 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -615,6 +615,17 @@ gpu.module @test_kernel {
// -----
+gpu.module @test_kernel {
+ // CHECK-LABEL: gpu.func @convert_layout_drop_slice_inst_data_to_null
+ // CHECK-NOT: xegpu.convert_layout
+ gpu.func @convert_layout_drop_slice_inst_data_to_null(%arg0: vector<1xf32>) -> vector<1xf32> {
+ %0 = xegpu.convert_layout %arg0 <{input_layout = #xegpu.layout<inst_data = [1]>, target_layout = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [1, 2]>}> : vector<1xf32>
+ gpu.return %0 : vector<1xf32>
+ }
+}
+
+// -----
+
#lb = #xegpu.layout<inst_data = [4, 32, 2], lane_layout = [1, 16, 1], lane_data = [4, 1, 2]>
#b = #xegpu.layout<inst_data = [4, 16, 2], lane_layout = [1, 16, 1], lane_data = [4, 1, 1]>
>From 862df9b4e2430e2f505f76a5a81a3b59b59cb1d6 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 28 Feb 2026 15:43:31 +0000
Subject: [PATCH 3/3] remove canonicalization for ConvertLayout op
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 1 -
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 17 -----------------
2 files changed, 18 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index f8a7c3b0f76a3..e8d1fbf6bf40c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1540,7 +1540,6 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
}];
let hasVerifier = 1;
- let hasCanonicalizer = 1;
}
class SizeInBits<string name> :
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5dafee8b66b80..e470d1f820f79 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -1113,23 +1113,6 @@ LogicalResult ConvertLayoutOp::verify() {
return mlir::success();
}
-struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
- using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
- LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
- PatternRewriter &rewriter) const override {
- if (op.getInputLayout() == op.getTargetLayout()) {
- rewriter.replaceOp(op, op.getSource());
- return success();
- }
- return failure();
- }
-};
-
-void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
- MLIRContext *context) {
- patterns.add<FoldConvertLayoutOp>(context);
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_LoadMatrixOp
//===----------------------------------------------------------------------===//
More information about the Mlir-commits
mailing list