[Mlir-commits] [mlir] [MLIR][XeGPU] Extend convert_layout op to support scalar type (PR #188874)
Jianhui Li
llvmlistbot at llvm.org
Fri Mar 27 09:32:45 PDT 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/188874
From 1439b93ce136e08501541bc36048c13e4ce15613 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 26 Mar 2026 20:54:40 +0000
Subject: [PATCH 1/4] add scalar type to convert_layout
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 4 ++--
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 2 ++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 20 ++++++++++---------
.../Transforms/XeGPUWgToSgDistribute.cpp | 6 ++++--
mlir/test/Dialect/XeGPU/layout.mlir | 7 +++++++
5 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 48737352497e1..3526178ea5753 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
: vector<128x128xf16>
```
}];
- let arguments = (ins XeGPU_VectorOrOffsetVectorType: $source,
+ let arguments = (ins XeGPU_ConvertLayoutType: $source,
DistributeLayoutAttr: $input_layout,
DistributeLayoutAttr: $target_layout);
- let results = (outs XeGPU_VectorOrOffsetVectorType: $result);
+ let results = (outs XeGPU_ConvertLayoutType: $result);
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index c50bd25df2742..f41c0bf1fd2b2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -29,6 +29,8 @@ def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
def XeGPU_VectorOrOffsetVectorType
: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
+def XeGPU_ConvertLayoutType
+ : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
def XeGPU_GatherScatterBaseAddrType
: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e470d1f820f79..5697097a4c999 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -1101,15 +1101,17 @@ LogicalResult ConvertLayoutOp::verify() {
return emitOpError("expected input layout and target layout be WgLayout or "
"SgLayout at the same time.");
- auto shape = getSource().getType().getShape();
- if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
- return emitOpError(
- "invalid input layout, data cannot be evenly distributed.");
-
- if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
- return emitOpError(
- "invalid target layout, data cannot be evenly distributed.");
-
+ Type srcType = getSource().getType();
+ if (llvm::isa<VectorType>(srcType)) {
+ auto shape = llvm::cast<VectorType>(srcType).getShape();
+ if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
+ return emitOpError(
+ "invalid input layout, data cannot be evenly distributed.");
+
+ if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
+ return emitOpError(
+ "invalid target layout, data cannot be evenly distributed.");
+ }
return mlir::success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 6dea94c0c5de3..da42da6750201 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -600,8 +600,10 @@ struct WgToSgConvertLayoutOp
ConversionPatternRewriter &rewriter) const override {
Location loc = op.getLoc();
- VectorType resultType = op.getResult().getType();
- ArrayRef<int64_t> wgShape = resultType.getShape();
+ Type resultType = op.getResult().getType();
+ ArrayRef<int64_t> wgShape;
+ if (isa<VectorType>(resultType))
+ wgShape = cast<VectorType>(resultType).getShape();
auto inputLayout = op.getInputLayout();
auto targetLayout = op.getTargetLayout();
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 29670d0b5aadd..bd332ddf4480a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -60,6 +60,13 @@ gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
gpu.return
}
+gpu.func @convert_layout_wg_scalar(%a: f16) {
+ // CHECK: xegpu.convert_layout
+ // CHECK-SAME: <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}> : f16
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}> : f16
+ gpu.return
+}
+
gpu.func @slice_attr() {
//CHECK: arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>, dims = [2]>} dense<8> : vector<16x8xindex>
From 519326725dd054756206747069b8c3d141e82bf1 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 26 Mar 2026 21:49:54 +0000
Subject: [PATCH 2/4] enable unrolling convert_layout with scalar
---
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 21 +++++++++++++------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 18 ++++++++++++++++
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 14 +++++++++++++
3 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index d633c1531955d..36b903c5b4303 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -1042,18 +1042,20 @@ struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();
- VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType());
- assert(valueTy && "the value type must be vector type!");
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
- return failure();
+ Type valType = op.getType();
xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr();
xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr();
if (!inputLayout || !targetLayout)
return rewriter.notifyMatchFailure(op, "missing layout attributes.");
+ if (valType.isIntOrFloat()) {
+ rewriter.replaceOp(op, op.getSource());
+ assert(!inputLayout.dropInstData() && !targetLayout.dropInstData() &&
+ "unexpected layout attributes for scalar type");
+ return success();
+ }
+
if (inputLayout.getEffectiveInstDataAsInt().empty() ||
targetLayout.getEffectiveInstDataAsInt().empty())
return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
@@ -1061,6 +1063,13 @@ struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
inputLayout = inputLayout.dropInstData();
targetLayout = targetLayout.dropInstData();
+ VectorType valueTy = llvm::dyn_cast<VectorType>(op.getType());
+ assert(valueTy && "the value type must be vector type!");
+
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
+ return failure();
+
Value newSource = op.getSource();
SmallVector<Value> newOps;
if (inputLayout && targetLayout) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index af8615740fde0..9ca424374335f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -602,6 +602,24 @@ gpu.module @test_kernel {
// -----
+#a = #xegpu.layout<inst_data = [8, 16]>
+gpu.module @test_kernel {
+ //CHECK-LABEL: gpu.func @convert_layout_scalar
+ gpu.func @convert_layout_scalar(%arg0: memref<16x16xf16>, %arg1: memref<4xf16>) {
+ %acc = arith.constant 0.000000e+00 : f16
+ %c0 = arith.constant 0 : index
+ %a_tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a>
+ %a = xegpu.load_nd %a_tdesc {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16>
+ %a_reduce = vector.multi_reduction <add>, %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16
+ // CHECK-NOT: xegpu.convert_layout
+ %13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16
+ memref.store %13, %arg1[%c0] : memref<4xf16>
+ gpu.return
+ }
+}
+
+// -----
+
#in = #xegpu.slice<#xegpu.layout<inst_data = [1, 16]>, dims = [1]>
#out = #xegpu.slice<#xegpu.layout<inst_data = [1, 16], lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
gpu.module @test_kernel {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 950d9ba66f0cc..90c6a73497630 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -959,6 +959,20 @@ gpu.module @test_distribution {
gpu.return
}
+ // CHECK-LABEL: convert_layout_reduce_to_scalar
+ gpu.func @convert_layout_reduce_to_scalar(%arg0: memref<32x32xf32>) {
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<true> : vector<32x32xi1>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<0> : vector<32x32xindex>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<32x32xf32> -> index
+ %10 = arith.index_cast %intptr : index to i64
+ %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>}> {layout_operand_1 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_operand_2 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32>
+ %12 = vector.multi_reduction <add>, %11, %cst_0 {layout_operand_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>} [0, 1] : vector<32x32xf32> to f32
+ // CHECK-NOT: xegpu.convert_layout
+ %13 = xegpu.convert_layout %12 <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>}> : f32
+ gpu.return
+ }
+
// CHECK-LABEL: distribute_nested_slice
// CHECK: %[[V0:.*]] = vector.shape_cast %{{.*}} : vector<32x32xf32> to vector<32x1x32x1xf32>
// CHECK: %[[V1:.*]] = vector.broadcast %[[V0]] : vector<32x1x32x1xf32> to vector<32x16x32x16xf32>
From 4e2ced768adf93685c1be80fc831ace6a27f9d5d Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 26 Mar 2026 23:04:24 +0000
Subject: [PATCH 3/4] add subgroup distribution support for convert_layout scalar
---
.../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 7 ++++++-
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir | 11 ++++++++---
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 69c2fb7493086..87f1c04130417 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -2083,11 +2083,16 @@ struct ConvertLayoutDistribution
PatternRewriter &rewriter) const override {
auto inputLayout = op.getInputLayoutAttr();
auto targetLayout = op.getTargetLayoutAttr();
- auto resShape = cast<VectorType>(op.getResult().getType()).getShape();
+ Type valType = op.getResult().getType();
if (!inputLayout || !targetLayout)
return rewriter.notifyMatchFailure(op, "missing layout attributes");
+ if (valType.isIntOrFloat()) {
+ rewriter.replaceOp(op, op.getSource());
+ return success();
+ }
+ auto resShape = cast<VectorType>(valType).getShape();
SmallVector<int64_t> resShapeVec(resShape.begin(), resShape.end());
if (!inputLayout.isCompatibleWith(targetLayout, resShapeVec,
xegpu::LayoutKind::Lane)) {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index fc6ccd3f3f887..f3d670bba7241 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1268,15 +1268,20 @@ gpu.func
// CHECK-NOT: xegpu.convert_layout
// CHECK: gpu.yield %{{.*}} : vector<16xf32>
gpu.func @convert_layout_removed_when_compatible(%laneid: index){
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>, f32) {
%0 = "some_op"() : () -> vector<16xf32>
+ %2 = "some_op"() : () -> f32
%1 = xegpu.convert_layout %0
<{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}>
: vector<16xf32>
- gpu.yield %1 : vector<16xf32>
+ %3 = xegpu.convert_layout %2
+ <{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}>
+ : f32
+ gpu.yield %1, %3 : vector<16xf32>, f32
}
- "some_user_op"(%r) : (vector<1xf32>) -> ()
+ "some_user_op"(%r#0, %r#1) : (vector<1xf32>, f32) -> ()
gpu.return
}
}
From 3a90aa2f2fa282806bb368a01213f79ea8f5ef77 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 27 Mar 2026 16:32:09 +0000
Subject: [PATCH 4/4] address feedback
---
.../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f21d25084cf17..0aead9172858f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -599,11 +599,6 @@ struct WgToSgConvertLayoutOp
matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Location loc = op.getLoc();
-
- Type resultType = op.getResult().getType();
- ArrayRef<int64_t> wgShape;
- if (isa<VectorType>(resultType))
- wgShape = cast<VectorType>(resultType).getShape();
auto inputLayout = op.getInputLayout();
auto targetLayout = op.getTargetLayout();
@@ -612,6 +607,16 @@ struct WgToSgConvertLayoutOp
return rewriter.notifyMatchFailure(
op, "Input and target layouts must have subgroup layout");
+ Type resultType = op.getResult().getType();
+ if (resultType.isIntOrFloat()) {
+ rewriter.replaceOp(op, op.getSource());
+ assert(!inputLayout.dropSgLayoutAndData() &&
+ !targetLayout.dropSgLayoutAndData() &&
+ "unexpected layout attributes for scalar type");
+ return success();
+ }
+
+ ArrayRef<int64_t> wgShape = cast<VectorType>(resultType).getShape();
SmallVector<int64_t> inputSgLayout =
inputLayout.getEffectiveSgLayoutAsInt();
SmallVector<int64_t> inputSgData = inputLayout.getEffectiveSgDataAsInt();
More information about the Mlir-commits mailing list