[Mlir-commits] [mlir] [mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp (PR #146176)
Chao Chen
llvmlistbot at llvm.org
Wed Jul 23 09:06:26 PDT 2025
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/146176
From 2e0f4dbcb5c3635904e6200cbe763b683e4e3f21 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 24 Jun 2025 19:57:09 +0000
Subject: [PATCH 01/18] update convert layout definition
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 21 +++----
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 4 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 60 +++++++++++++------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 4 ++
mlir/test/Dialect/XeGPU/invalid.mlir | 14 +----
mlir/test/Dialect/XeGPU/layout.mlir | 8 +--
6 files changed, 67 insertions(+), 44 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index daab65ec893b8..97887cef684df 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -918,21 +918,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
let summary = "Convert the layout of the input operand";
let description = [{
- `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
- the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
- as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
- lowered to WI level because that is the end result of all distributions.
+ `convert_layout` redistributes data across subgroups and/or work-items from the `input_layout` to
+ the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
+ scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
+ the IR is lowered to WI level because that is the end result of all distributions.
}];
- let arguments = (ins XeGPU_Vector2DType: $source,
- XeGPU_LayoutAttr: $srcMap,
- XeGPU_LayoutAttr: $resMap
- );
- let results = (outs XeGPU_Vector2DType: $result);
+ let arguments = (ins XeGPU_VectorType: $source,
+ XeGPU_LayoutAttr: $input_layout,
+ XeGPU_LayoutAttr: $target_layout);
+ let results = (outs XeGPU_VectorType: $result);
let assemblyFormat = [{
- $source attr-dict `:` type($source)
+ $source prop-dict attr-dict `:` type($source)
}];
+ let hasFolder = 1;
let hasVerifier = 1;
+ let hasCanonicalizer = 1;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 84314875c2ae5..af40b3754bd8a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -21,8 +21,8 @@ def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
-def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
-def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+def XeGPU_ValueType: AnyTypeOf<[XeGPU_VectorType, XeGPU_ScalarType]>;
// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2793c7a35bc97..10ce019d5a812 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -609,32 +609,58 @@ LogicalResult DpasOp::verify() {
// XeGPU_ConvertLayoutOp
//===----------------------------------------------------------------------===//
LogicalResult ConvertLayoutOp::verify() {
- auto srcMap = getSrcMapAttr();
- auto resMap = getResMapAttr();
- if (!srcMap)
- return emitOpError("expected srcMap.");
- if (!resMap)
- return emitOpError("expected resMap.");
-
- if (srcMap == resMap)
- return emitOpError("expected different srcMap and resMap.");
+ auto srcLayout = getInputLayoutAttr();
+ auto resLayout = getTargetLayoutAttr();
+ if (!srcLayout)
+ return emitOpError("expected input layout.");
+ if (!resLayout)
+ return emitOpError("expected target layout.");
// both srcMap and resMap should be WgLayout or SgLayout at the same time.
- if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
- (!srcMap.isSgLayout() || !resMap.isSgLayout()))
- return emitOpError(
- "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
+ if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
+ (!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
+ return emitOpError("expected input layout and target layout be WgLayout or "
+ "SgLayout at the same time.");
auto shape = getSource().getType().getShape();
- if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
- return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+ if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
+ return emitOpError(
+ "invalid input layout, data cannot be evenly distributed.");
- if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
- return emitOpError("invalid resMap, data cannot be evenly distributed.");
+ if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
+ return emitOpError(
+ "invalid target layout, data cannot be evenly distributed.");
return mlir::success();
}
+OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
+ llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n";
+ auto srcLayout = getInputLayoutAttr();
+ auto resLayout = getTargetLayoutAttr();
+ if (srcLayout == resLayout)
+ return adaptor.getSource();
+ return {};
+}
+
+struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+ using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
+ auto inputLayout = op.getInputLayoutAttr();
+ auto targetLayout = op.getTargetLayoutAttr();
+ if (inputLayout != targetLayout)
+ return failure();
+ rewriter.replaceOp(op, op.getSource());
+ return success();
+ }
+};
+
+void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
+ MLIRContext *context) {
+ patterns.add<FoldConvertLayoutOp>(context);
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 6b85a66a8bd36..aa1755e25996a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
+ // For ConvertLayoutOp, the layout is stored in the tensor descriptor
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+ return convertOp.getTargetLayoutAttr();
+
// for LoadNdOp, the layout is stored in the tensor descriptor
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index a2778cd94d963..65e1d22449bdd 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -511,19 +511,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
return
}
-// -----
-func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
- // expected-error@+1 {{expected different srcMap and resMap}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
- gpu.return
-}
-
// -----
func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
- // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ // expected-error@+1 {{expected input layout and target layout to be WgLayout or SgLayout at the same time}}
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 7f3ebec225cdf..ef51dfbbfd574 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,14 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}
gpu.func @convert_layout(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
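
For readers skimming the patch, the new assembly format with prop-dict reads like this (a minimal sketch pieced together from the tests above; the layout values are the ones used there):

    gpu.func @convert_layout_example(%a: vector<32x64xf16>) {
      // Same data, redistributed across lanes: each lane goes from owning
      // 2x1 elements to 1x1 elements per instruction.
      %0 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                                     target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
      gpu.return
    }

With the new hasFolder/hasCanonicalizer hooks, an op whose input_layout equals its target_layout folds away to its source, which is why the same-layout case no longer needs a verifier error.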
From 9e89e7279a56816b54f5eb5ce1fc9ed3fcde0576 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 24 Jun 2025 21:16:38 +0000
Subject: [PATCH 02/18] add convert layout blocking pattern
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 ++++--------
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 11 +++++++++++
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 10ce019d5a812..54b1e360d11f1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -609,8 +609,8 @@ LogicalResult DpasOp::verify() {
// XeGPU_ConvertLayoutOp
//===----------------------------------------------------------------------===//
LogicalResult ConvertLayoutOp::verify() {
- auto srcLayout = getInputLayoutAttr();
- auto resLayout = getTargetLayoutAttr();
+ auto srcLayout = getInputLayout();
+ auto resLayout = getTargetLayout();
if (!srcLayout)
return emitOpError("expected input layout.");
if (!resLayout)
@@ -636,9 +636,7 @@ LogicalResult ConvertLayoutOp::verify() {
OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n";
- auto srcLayout = getInputLayoutAttr();
- auto resLayout = getTargetLayoutAttr();
- if (srcLayout == resLayout)
+ if (getInputLayout() == getTargetLayout())
return adaptor.getSource();
return {};
}
@@ -647,9 +645,7 @@ struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
- auto inputLayout = op.getInputLayoutAttr();
- auto targetLayout = op.getTargetLayoutAttr();
- if (inputLayout != targetLayout)
+ if (op.getInputLayout() != op.getTargetLayout())
return failure();
rewriter.replaceOp(op, op.getSource());
return success();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 3950e8f70d1ca..bf6d0b3164e16 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -78,6 +78,17 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
+struct ConvertLayoutOpPattern: public OpRewritePattern<xegpu::ConvertLayoutOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override {
+ xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData();
+ xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData();
+ auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
+ rewriter.replaceOp(op, newOp);
+ return success();
+ }
+};
+
//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
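
To make the blocking change above concrete, here is a sketch of what ConvertLayoutOpPattern produces (hypothetical value %v; the layouts mirror the blocking test added later in this series):

    // Before blocking: subgroup-level op still carrying inst_data.
    %0 = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>,
                                   target_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
    // After the pattern: inst_data dropped, lane fields kept.
    %1 = xegpu.convert_layout %v <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>,
                                   target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>

Since the pattern goes through createOrFold, the op disappears entirely when the two stripped layouts end up identical.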
From 149aeeaa3148f98d378177ccb64c8941a41d8dd4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 25 Jun 2025 15:00:36 +0000
Subject: [PATCH 03/18] add WgToSg pattern for convert layout
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 +-
.../XeGPU/Transforms/XeGPUBlocking.cpp | 1 +
.../Transforms/XeGPUWgToSgDistribute.cpp | 34 +++++++++++++++++--
3 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 54b1e360d11f1..00fe251f48757 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -635,9 +635,8 @@ LogicalResult ConvertLayoutOp::verify() {
}
OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
- llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n";
if (getInputLayout() == getTargetLayout())
- return adaptor.getSource();
+ return getSource();
return {};
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index bf6d0b3164e16..3472bceca40ce 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -346,6 +346,7 @@ void XeGPUBlockingPass::runOnOperation() {
});
RewritePatternSet patterns(ctx);
+ patterns.add<ConvertLayoutOpPattern>(ctx);
vector::UnrollVectorOptions vectorOptions;
vectorOptions.setNativeShapeFn(options.nativeShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index e3563d10bc6f1..fa45169021581 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -390,6 +390,31 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};
+struct WgToSgConvertLayoutOp
+ : public OpConversionPattern<xegpu::ConvertLayoutOp> {
+ using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::LayoutAttr input = op.getInputLayout();
+ xegpu::LayoutAttr target = op.getTargetLayout();
+ if (input.getSgLayout() == target.getSgLayout() &&
+ input.getSgData() == target.getSgData()) {
+ input = input.dropSgLayoutAndData();
+ target = target.dropSgLayoutAndData();
+ SmallVector<Value> newOps;
+ for (auto src : adaptor.getSource()) {
+ auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+ op.getLoc(), src.getType(), src, input, target);
+ newOps.push_back(newOp);
+ }
+ rewriter.replaceOpWithMultiple(op, newOps);
+ return success();
+ }
+ return failure();
+ }
+};
+
// Handles UnrealizedConversionCastOp generated during
// SCFStructuralTypeConversions (step 1). This op may appear as either a
// target or source materialization for Vector values, e.g.:
@@ -473,8 +498,8 @@ namespace xegpu {
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
- UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
- patterns.getContext());
+ UnrealizedConversionCastOpPattern, WgToSgElementwiseOp,
+ WgToSgConvertLayoutOp>(patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
@@ -581,6 +606,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(layout);
});
+ target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
+ [=](xegpu::ConvertLayoutOp op) -> bool {
+ return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
+ });
+
target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
[=](Operation *op) -> std::optional<bool> {
// Only handle elementwise mappable ops
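
The new WgToSgConvertLayoutOp pattern only fires when sg_layout and sg_data match on both sides. In that case the workgroup-level op becomes one subgroup-level op per distributed value, with the sg fields dropped; a sketch (layouts taken from the @convert_layout_optimal test added later in this series; %v_sg stands for one subgroup-local piece of the source):

    // Workgroup level:
    %0 = xegpu.convert_layout %v <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
                                   target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
    // Subgroup level, one op per 16x16 chunk the subgroup owns:
    %1 = xegpu.convert_layout %v_sg <{input_layout = #xegpu.layout<inst_data = [16, 16]>,
                                      target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>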
From aee53c4cff7abc4665598c8ee9689456cc373889 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 27 Jun 2025 23:26:05 +0000
Subject: [PATCH 04/18] improve ConvertLayoutOpPattern
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 14 +-
.../Transforms/XeGPUWgToSgDistribute.cpp | 206 ++++++++++++++----
2 files changed, 168 insertions(+), 52 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ef61de190b4c..6249d0484c215 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -313,13 +313,13 @@ LogicalResult TensorDescType::verify(
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
- auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
- if (blockAttr) {
- MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
- if (rank == 2 && memorySpaceAttr &&
- memorySpaceAttr.getValue() == MemorySpace::SLM)
- return emitError() << "SLM is not supported for 2D block tensor";
- }
+ // auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
+ // if (blockAttr) {
+ // MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
+ // if (rank == 2 && memorySpaceAttr &&
+ // memorySpaceAttr.getValue() == MemorySpace::SLM)
+ // return emitError() << "SLM is not supported for 2D block tensor";
+ // }
// for gather and scatter ops, Low-precision types are packed in 32-bit units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index fa45169021581..d542fb219a7c7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -57,6 +57,39 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
return std::make_pair(sgShape, count);
}
+// Calculate offset for each subgroup
+static SmallVector<OpFoldResult>
+calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
+ const SmallVector<OpFoldResult> &originalOffsets,
+ const SmallVector<Value> &localOffset,
+ const SmallVector<int64_t> &distUnitBaseAddr,
+ const SmallVector<int64_t> &distUnitShape) {
+ assert(localOffset.size() == distUnitBaseAddr.size() &&
+ "localOffset and distUnitBaseAddr must have the same rank");
+
+ SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
+ originalOffsets.end());
+ size_t rank = localOffset.size();
+ for (size_t i = 0; i < rank; ++i) {
+ size_t dimIdx = originalOffsets.size() - rank + i;
+ Value constOffset =
+ rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
+ Value offset =
+ rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
+ Value modValue =
+ rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
+ Value offsetMod =
+ rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
+ Value origOffset =
+ getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]);
+ Value globalOffset =
+ rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
+ globalOffsets[dimIdx] = globalOffset;
+ }
+
+ return globalOffsets;
+}
+
/// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
/// from a workgroup descriptor. It replaces the offsets and sizes with
/// appropriate values for the subgroup.
@@ -105,39 +138,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
- // Calculate offset for each subgroup
- SmallVector<OpFoldResult>
- calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
- const SmallVector<OpFoldResult> &originalOffsets,
- const SmallVector<Value> &localOffset,
- const SmallVector<int64_t> &distUnitBaseAddr,
- const SmallVector<int64_t> &distUnitShape) const {
- assert(localOffset.size() == distUnitBaseAddr.size() &&
- "localOffset and distUnitBaseAddr must have the same rank");
-
- SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
- originalOffsets.end());
- size_t rank = localOffset.size();
- for (size_t i = 0; i < rank; ++i) {
- size_t dimIdx = originalOffsets.size() - rank + i;
- Value constOffset =
- rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
- Value offset =
- rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
- Value modValue =
- rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
- Value offsetMod =
- rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
- Value origOffset = getValueOrCreateConstantIndexOp(
- rewriter, loc, originalOffsets[dimIdx]);
- Value globalOffset =
- rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
- globalOffsets[dimIdx] = globalOffset;
- }
-
- return globalOffsets;
- }
-
LogicalResult
matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
@@ -390,6 +390,21 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};
+// Allocate a SLM buffer based on the size of the given vector type.
+static TypedValue<MemRefType>
+allocateSLMBuffer(ConversionPatternRewriter &rewriter, Location loc,
+ VectorType type) {
+ int64_t bits = type.getElementType().getIntOrFloatBitWidth();
+ int64_t slmSizeInBytes = type.getNumElements() * bits / 8;
+ auto slmTy = MemRefType::get(slmSizeInBytes, rewriter.getI8Type(), {}, 3);
+ auto slm = rewriter.create<memref::AllocOp>(loc, slmTy);
+ auto viewTy = MemRefType::get(type.getShape(), type.getElementType(), {}, 3);
+ auto view = rewriter.create<memref::ViewOp>(
+ loc, viewTy, slm, rewriter.create<arith::ConstantIndexOp>(loc, 0),
+ ValueRange());
+ return view;
+}
+
struct WgToSgConvertLayoutOp
: public OpConversionPattern<xegpu::ConvertLayoutOp> {
using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
@@ -398,20 +413,121 @@ struct WgToSgConvertLayoutOp
ConversionPatternRewriter &rewriter) const override {
xegpu::LayoutAttr input = op.getInputLayout();
xegpu::LayoutAttr target = op.getTargetLayout();
- if (input.getSgLayout() == target.getSgLayout() &&
- input.getSgData() == target.getSgData()) {
- input = input.dropSgLayoutAndData();
- target = target.dropSgLayoutAndData();
- SmallVector<Value> newOps;
- for (auto src : adaptor.getSource()) {
- auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
- op.getLoc(), src.getType(), src, input, target);
- newOps.push_back(newOp);
+
+ if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
+ return rewriter.notifyMatchFailure(
+ op, "Input and target layouts must have subgroup layout");
+
+ // initialize values with the source values
+ SmallVector<Value> values(adaptor.getSource());
+
+ Location loc = op.getLoc();
+ MLIRContext *ctx = op.getContext();
+ VectorType type = op.getResult().getType();
+ ArrayRef<int64_t> shape = type.getShape();
+
+ DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
+ DenseI32ArrayAttr inputSgData = input.getSgData();
+ DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
+ DenseI32ArrayAttr targetSgData = target.getSgData();
+
+ // we only need SLM support when input and target layouts are different
+ if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) {
+ values.clear();
+ rewriter.setInsertionPoint(op);
+ TypedValue<MemRefType> slmBuffer = allocateSLMBuffer(rewriter, loc, type);
+
+ auto linearSgId = rewriter.create<gpu::SubgroupIdOp>(
+ loc, rewriter.getIndexType(), nullptr);
+
+ { // store to slm buffer
+ SmallVector<int64_t> sgLayout =
+ llvm::to_vector_of<int64_t>(input.getSgLayout().asArrayRef());
+ SmallVector<int64_t> sgShape = getSgShapeAndCount(shape, input).first;
+ auto delinearized = affine::delinearizeIndex(
+ rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout));
+ if (failed(delinearized))
+ return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId");
+ SmallVector<Value> sgIds = *delinearized;
+
+ SmallVector<int64_t> distUnitShape(sgLayout.size());
+ SmallVector<Value> localOffset(sgLayout.size());
+ for (size_t i = 0; i < sgLayout.size(); i++) {
+ distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]);
+ localOffset[i] = rewriter.createOrFold<index::MulOp>(
+ loc, sgIds[i],
+ rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]));
+ }
+
+ auto tdescTy = xegpu::TensorDescType::get(
+ sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM,
+ input.dropSgLayoutAndData());
+
+ SmallVector<OpFoldResult> zeros = getAsIndexOpFoldResult(
+ ctx, SmallVector<int64_t>(sgLayout.size(), 0));
+ for (auto [data, baseOffsets] :
+ llvm::zip_equal(adaptor.getSource(),
+ StaticTileOffsetRange(shape, distUnitShape))) {
+ SmallVector<OpFoldResult> offsets = calculateGlobalOffsets(
+ rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape);
+ auto tdesc = rewriter.create<xegpu::CreateNdDescOp>(
+ loc, tdescTy, slmBuffer, offsets);
+ rewriter.create<xegpu::StoreNdOp>(loc, data, tdesc, nullptr, nullptr,
+ nullptr);
+ }
+ }
+
+ rewriter.create<gpu::BarrierOp>(loc);
+
+ { // load from SLM
+ SmallVector<int64_t> sgLayout =
+ llvm::to_vector_of<int64_t>(target.getSgLayout().asArrayRef());
+ SmallVector<int64_t> sgShape = getSgShapeAndCount(shape, target).first;
+ auto delinearized = affine::delinearizeIndex(
+ rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout));
+ if (failed(delinearized))
+ return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId");
+ SmallVector<Value> sgIds = *delinearized;
+
+ SmallVector<int64_t> distUnitShape(sgLayout.size());
+ SmallVector<Value> localOffset(sgLayout.size());
+ for (size_t i = 0; i < sgLayout.size(); i++) {
+ distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]);
+ localOffset[i] = rewriter.createOrFold<index::MulOp>(
+ loc, sgIds[i],
+ rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]));
+ }
+
+ auto tdescTy = xegpu::TensorDescType::get(
+ sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM,
+ target.dropSgLayoutAndData());
+ auto valueTy = VectorType::get(sgShape, type.getElementType());
+
+ SmallVector<OpFoldResult> zeros = getAsIndexOpFoldResult(
+ ctx, SmallVector<int64_t>(sgLayout.size(), 0));
+ for (auto baseOffsets : StaticTileOffsetRange(shape, distUnitShape)) {
+ SmallVector<OpFoldResult> offsets = calculateGlobalOffsets(
+ rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape);
+ auto tdesc = rewriter.create<xegpu::CreateNdDescOp>(
+ loc, tdescTy, slmBuffer, offsets);
+ auto newOp = rewriter.create<xegpu::LoadNdOp>(
+ loc, TypeRange({valueTy}), ValueRange({tdesc}));
+ values.push_back(newOp);
+ }
}
- rewriter.replaceOpWithMultiple(op, newOps);
- return success();
}
- return failure();
+
+ input = input.dropSgLayoutAndData();
+ target = target.dropSgLayoutAndData();
+
+ SmallVector<Value> newOps;
+ for (auto src : values) {
+ auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+ op.getLoc(), src.getType(), src, input, target);
+ newOps.push_back(newOp);
+ }
+ rewriter.replaceOpWithMultiple(op, newOps);
+ return success();
}
};
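
For the case where sg_layout or sg_data differ, the pattern above routes the data through shared local memory. Roughly, the generated IR has this shape (a hand-written sketch with hypothetical sizes and SSA names; the real tensor_desc types additionally carry the SLM memory-space and layout attributes, elided here for readability):

    // A 32x64xf32 payload needs 32*64*4 = 8192 bytes of SLM.
    %slm  = memref.alloc() : memref<8192xi8, 3>
    %c0   = arith.constant 0 : index
    %view = memref.view %slm[%c0][] : memref<8192xi8, 3> to memref<32x64xf32, 3>
    %sgid = gpu.subgroup_id : index
    // Delinearize %sgid over the input sg_layout; per dimension i the offset is
    //   origOffset[i] + ((sgId[i]*sgShape[i] + unitBase[i]) mod unitShape[i])
    // as computed by calculateGlobalOffsets, then store each chunk:
    %st = xegpu.create_nd_tdesc %view[%o0, %o1] : memref<32x64xf32, 3> -> !xegpu.tensor_desc<16x16xf32>
    xegpu.store_nd %chunk, %st : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
    gpu.barrier
    // Re-delinearize over the target sg_layout and load the chunks back:
    %ld  = xegpu.create_nd_tdesc %view[%n0, %n1] : memref<32x64xf32, 3> -> !xegpu.tensor_desc<8x32xf32>
    %new = xegpu.load_nd %ld : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>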
From c416cec159b701fbd405b049be1330f6ee24afc7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 30 Jun 2025 15:07:33 +0000
Subject: [PATCH 05/18] code format
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 3472bceca40ce..06e0c6105df58 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -78,12 +78,15 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
-struct ConvertLayoutOpPattern: public OpRewritePattern<xegpu::ConvertLayoutOp> {
+struct ConvertLayoutOpPattern
+ : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern::OpRewritePattern;
- LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override {
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData();
xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData();
- auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
+ auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
+ op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
rewriter.replaceOp(op, newOp);
return success();
}
From 65b5dbd5745c662f64212c7803af45fe9605f5ca Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 9 Jul 2025 22:11:37 +0000
Subject: [PATCH 06/18] refactor ConvertLayoutPattern for wg to sg.
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 14 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 11 +-
.../Transforms/XeGPUWgToSgDistribute.cpp | 193 +++++-------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 2 +-
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 10 +
5 files changed, 70 insertions(+), 160 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 6249d0484c215..7ef61de190b4c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -313,13 +313,13 @@ LogicalResult TensorDescType::verify(
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
- // auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
- // if (blockAttr) {
- // MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
- // if (rank == 2 && memorySpaceAttr &&
- // memorySpaceAttr.getValue() == MemorySpace::SLM)
- // return emitError() << "SLM is not supported for 2D block tensor";
- // }
+ auto blockAttr = mlir::dyn_cast_if_present<BlockTensorDescAttr>(encoding);
+ if (blockAttr) {
+ MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace();
+ if (rank == 2 && memorySpaceAttr &&
+ memorySpaceAttr.getValue() == MemorySpace::SLM)
+ return emitError() << "SLM is not supported for 2D block tensor";
+ }
// for gather and scatter ops, Low-precision types are packed in 32-bit units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 00fe251f48757..03e3aed3fa8ce 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -616,7 +616,7 @@ LogicalResult ConvertLayoutOp::verify() {
if (!resLayout)
return emitOpError("expected target layout.");
- // both srcMap and resMap should be WgLayout or SgLayout at the same time.
+ // both input and target layouts should be WgLayout or SgLayout at the same time.
if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
(!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
return emitOpError("expected input layout and target layout be WgLayout or "
@@ -644,10 +644,11 @@ struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
- if (op.getInputLayout() != op.getTargetLayout())
- return failure();
- rewriter.replaceOp(op, op.getSource());
- return success();
+ if (op.getInputLayout() == op.getTargetLayout()) {
+ rewriter.replaceOp(op, op.getSource());
+ return success();
+ }
+ return failure();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index d542fb219a7c7..ed393309b3309 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -57,39 +57,6 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
return std::make_pair(sgShape, count);
}
-// Calculate offset for each subgroup
-static SmallVector<OpFoldResult>
-calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
- const SmallVector<OpFoldResult> &originalOffsets,
- const SmallVector<Value> &localOffset,
- const SmallVector<int64_t> &distUnitBaseAddr,
- const SmallVector<int64_t> &distUnitShape) {
- assert(localOffset.size() == distUnitBaseAddr.size() &&
- "localOffset and distUnitBaseAddr must have the same rank");
-
- SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
- originalOffsets.end());
- size_t rank = localOffset.size();
- for (size_t i = 0; i < rank; ++i) {
- size_t dimIdx = originalOffsets.size() - rank + i;
- Value constOffset =
- rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
- Value offset =
- rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
- Value modValue =
- rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
- Value offsetMod =
- rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
- Value origOffset =
- getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]);
- Value globalOffset =
- rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
- globalOffsets[dimIdx] = globalOffset;
- }
-
- return globalOffsets;
-}
-
/// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
/// from a workgroup descriptor. It replaces the offsets and sizes with
/// appropriate values for the subgroup.
@@ -138,6 +105,39 @@ calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
+ // Calculate offset for each subgroup
+ static SmallVector<OpFoldResult>
+ calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
+ const SmallVector<OpFoldResult> &originalOffsets,
+ const SmallVector<Value> &localOffset,
+ const SmallVector<int64_t> &distUnitBaseAddr,
+ const SmallVector<int64_t> &distUnitShape) {
+ assert(localOffset.size() == distUnitBaseAddr.size() &&
+ "localOffset and distUnitBaseAddr must have the same rank");
+
+ SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
+ originalOffsets.end());
+ size_t rank = localOffset.size();
+ for (size_t i = 0; i < rank; ++i) {
+ size_t dimIdx = originalOffsets.size() - rank + i;
+ Value constOffset =
+ rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
+ Value offset =
+ rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
+ Value modValue =
+ rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
+ Value offsetMod =
+ rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
+ Value origOffset =
+ getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]);
+ Value globalOffset =
+ rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
+ globalOffsets[dimIdx] = globalOffset;
+ }
+
+ return globalOffsets;
+ }
+
LogicalResult
matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
@@ -390,21 +390,6 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};
-// Allocate a SLM buffer based on the size of the given vector type.
-static TypedValue<MemRefType>
-allocateSLMBuffer(ConversionPatternRewriter &rewriter, Location loc,
- VectorType type) {
- int64_t bits = type.getElementType().getIntOrFloatBitWidth();
- int64_t slmSizeInBytes = type.getNumElements() * bits / 8;
- auto slmTy = MemRefType::get(slmSizeInBytes, rewriter.getI8Type(), {}, 3);
- auto slm = rewriter.create<memref::AllocOp>(loc, slmTy);
- auto viewTy = MemRefType::get(type.getShape(), type.getElementType(), {}, 3);
- auto view = rewriter.create<memref::ViewOp>(
- loc, viewTy, slm, rewriter.create<arith::ConstantIndexOp>(loc, 0),
- ValueRange());
- return view;
-}
-
struct WgToSgConvertLayoutOp
: public OpConversionPattern<xegpu::ConvertLayoutOp> {
using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
@@ -418,115 +403,29 @@ struct WgToSgConvertLayoutOp
return rewriter.notifyMatchFailure(
op, "Input and target layouts must have subgroup layout");
- // initialize values with the source values
- SmallVector<Value> values(adaptor.getSource());
-
- Location loc = op.getLoc();
- MLIRContext *ctx = op.getContext();
- VectorType type = op.getResult().getType();
- ArrayRef<int64_t> shape = type.getShape();
-
DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
DenseI32ArrayAttr inputSgData = input.getSgData();
DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
DenseI32ArrayAttr targetSgData = target.getSgData();
- // we only need SLM support when input and target layouts are different
- if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) {
- values.clear();
- rewriter.setInsertionPoint(op);
- TypedValue<MemRefType> slmBuffer = allocateSLMBuffer(rewriter, loc, type);
-
- auto linearSgId = rewriter.create<gpu::SubgroupIdOp>(
- loc, rewriter.getIndexType(), nullptr);
-
- { // store to slm buffer
- SmallVector<int64_t> sgLayout =
- llvm::to_vector_of<int64_t>(input.getSgLayout().asArrayRef());
- SmallVector<int64_t> sgShape = getSgShapeAndCount(shape, input).first;
- auto delinearized = affine::delinearizeIndex(
- rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout));
- if (failed(delinearized))
- return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId");
- SmallVector<Value> sgIds = *delinearized;
-
- SmallVector<int64_t> distUnitShape(sgLayout.size());
- SmallVector<Value> localOffset(sgLayout.size());
- for (size_t i = 0; i < sgLayout.size(); i++) {
- distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]);
- localOffset[i] = rewriter.createOrFold<index::MulOp>(
- loc, sgIds[i],
- rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]));
- }
-
- auto tdescTy = xegpu::TensorDescType::get(
- sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM,
- input.dropSgLayoutAndData());
-
- SmallVector<OpFoldResult> zeros = getAsIndexOpFoldResult(
- ctx, SmallVector<int64_t>(sgLayout.size(), 0));
- for (auto [data, baseOffsets] :
- llvm::zip_equal(adaptor.getSource(),
- StaticTileOffsetRange(shape, distUnitShape))) {
- SmallVector<OpFoldResult> offsets = calculateGlobalOffsets(
- rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape);
- auto tdesc = rewriter.create<xegpu::CreateNdDescOp>(
- loc, tdescTy, slmBuffer, offsets);
- rewriter.create<xegpu::StoreNdOp>(loc, data, tdesc, nullptr, nullptr,
- nullptr);
- }
- }
-
- rewriter.create<gpu::BarrierOp>(loc);
-
- { // load from SLM
- SmallVector<int64_t> sgLayout =
- llvm::to_vector_of<int64_t>(target.getSgLayout().asArrayRef());
- SmallVector<int64_t> sgShape = getSgShapeAndCount(shape, target).first;
- auto delinearized = affine::delinearizeIndex(
- rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout));
- if (failed(delinearized))
- return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId");
- SmallVector<Value> sgIds = *delinearized;
-
- SmallVector<int64_t> distUnitShape(sgLayout.size());
- SmallVector<Value> localOffset(sgLayout.size());
- for (size_t i = 0; i < sgLayout.size(); i++) {
- distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]);
- localOffset[i] = rewriter.createOrFold<index::MulOp>(
- loc, sgIds[i],
- rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]));
- }
-
- auto tdescTy = xegpu::TensorDescType::get(
- sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM,
- target.dropSgLayoutAndData());
- auto valueTy = VectorType::get(sgShape, type.getElementType());
-
- SmallVector<OpFoldResult> zeros = getAsIndexOpFoldResult(
- ctx, SmallVector<int64_t>(sgLayout.size(), 0));
- for (auto baseOffsets : StaticTileOffsetRange(shape, distUnitShape)) {
- SmallVector<OpFoldResult> offsets = calculateGlobalOffsets(
- rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape);
- auto tdesc = rewriter.create<xegpu::CreateNdDescOp>(
- loc, tdescTy, slmBuffer, offsets);
- auto newOp = rewriter.create<xegpu::LoadNdOp>(
- loc, TypeRange({valueTy}), ValueRange({tdesc}));
- values.push_back(newOp);
- }
- }
- }
+ // TODO: currently we only support the optimal case, where the input and
+ // target layouts have the same sg_layout and sg_data, so SLM is not involved.
+ if (inputSgLayout != targetSgLayout || inputSgData != targetSgData)
+ return failure();
input = input.dropSgLayoutAndData();
target = target.dropSgLayoutAndData();
- SmallVector<Value> newOps;
- for (auto src : values) {
- auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
- op.getLoc(), src.getType(), src, input, target);
- newOps.push_back(newOp);
+ SmallVector<Value> newOps(adaptor.getSource());
+
+ if (input && target) {
+ for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
+ auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+ op.getLoc(), src.getType(), src, input, target);
+ newOps[i] = newOp;
+ }
}
- rewriter.replaceOpWithMultiple(op, newOps);
+ rewriter.replaceOpWithMultiple(op, {newOps});
return success();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index aa1755e25996a..d5ae3c20e222e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,7 +124,7 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- // For ConvertLayoutOp, the layout is stored in the tensor descriptor
+ // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
return convertOp.getTargetLayoutAttr();
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index c6124f90e0f48..6c688f4db6dec 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -198,4 +198,14 @@ gpu.module @test_round_robin_assignment {
gpu.return
}
+ gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
+ %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
+ //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
+ //CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+ %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
+ gpu.return
+ }
+
}
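
A note on the expected counts in @convert_layout_optimal above: with shape 32x64, sg_layout = [2, 2] and sg_data = [16, 16], the distribution unit is min(2*16, 32) x min(2*16, 64) = 32x32, so the 32x64 payload contains two units and each subgroup handles two 16x16 chunks in round-robin order, hence two load_nd and two convert_layout ops per subgroup.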
From ec4e7ad1ec94efb0fab31daf412cddfd9e439753 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 9 Jul 2025 22:16:02 +0000
Subject: [PATCH 07/18] fix format issue
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 ++-
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 4 ++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 03e3aed3fa8ce..97415cc74f928 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -616,7 +616,8 @@ LogicalResult ConvertLayoutOp::verify() {
if (!resLayout)
return emitOpError("expected target layout.");
- // both input and target layouts should be WgLayout or SgLayout at the same time.
+ // both input and target layouts should be WgLayout or SgLayout at the same
+ // time.
if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
(!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
return emitOpError("expected input layout and target layout be WgLayout or "
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index ed393309b3309..89dcddec752a1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -128,8 +128,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
Value offsetMod =
rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
- Value origOffset =
- getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]);
+ Value origOffset = getValueOrCreateConstantIndexOp(
+ rewriter, loc, originalOffsets[dimIdx]);
Value globalOffset =
rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
globalOffsets[dimIdx] = globalOffset;
From b9c02fcc90fa402a4917741a0b5620b8fd305e6a Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 11 Jul 2025 16:50:55 +0000
Subject: [PATCH 08/18] fix a bug
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index d5ae3c20e222e..bdc80d78f202a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -141,7 +141,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
auto parentOp = arg.getOwner()->getParentOp();
if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
- return getLayoutAttr(tiedInit->get());
+ if (tiedInit)
+ return getLayoutAttr(tiedInit->get());
}
}
@@ -178,11 +179,15 @@ void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
for (OpOperand &opr : nestOp->getOpOperands()) {
+ llvm::dbgs() << "set layout for: " << opr.get();
auto layout = getLayoutImpl(opr.get());
+ llvm::dbgs() << " with: " << layout << "\n";
setLayoutAttr(opr, layout);
}
for (OpResult result : nestOp->getOpResults()) {
+ llvm::dbgs() << "set layout for: " << result;
auto layout = getLayoutImpl(result);
+ llvm::dbgs() << " with: " << layout << "\n";
setLayoutAttr(result, layout);
}
});
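
For context on the null check in this fix: getTiedLoopInit only returns an init operand for loop-carried block arguments. A sketch of the two kinds of arguments involved (hypothetical shapes and values):

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %init = arith.constant dense<0.0> : vector<16x16xf32>
    %r = scf.for %i = %c0 to %c32 step %c1
        iter_args(%acc = %init) -> (vector<16x16xf32>) {
      // %acc is tied to %init, so getLayoutAttr can recurse into the init value;
      // %i (the induction variable) has no tied init, and getTiedLoopInit
      // returns null for it -- which the added check now guards.
      scf.yield %acc : vector<16x16xf32>
    }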
From d8035af2f8f548452030a171c65d1926c9d59ae7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 11 Jul 2025 16:53:07 +0000
Subject: [PATCH 09/18] clean up
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index bdc80d78f202a..8922c07f24067 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -179,15 +179,11 @@ void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
for (OpOperand &opr : nestOp->getOpOperands()) {
- llvm::dbgs() << "set layout for: " << opr.get();
auto layout = getLayoutImpl(opr.get());
- llvm::dbgs() << " with: " << layout << "\n";
setLayoutAttr(opr, layout);
}
for (OpResult result : nestOp->getOpResults()) {
- llvm::dbgs() << "set layout for: " << result;
auto layout = getLayoutImpl(result);
- llvm::dbgs() << " with: " << layout << "\n";
setLayoutAttr(result, layout);
}
});
From da7f78a4cfca235e26d6c23e2eaa07bc760cce5d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 17 Jul 2025 15:53:58 +0000
Subject: [PATCH 10/18] fix merge issue
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 277158ac85409..708fc7013b1a2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -22,7 +22,7 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_Vector2DType: FixedVectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
From 0b42c3b96ae5972a3617676f8c9ff61e1891b156 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 17 Jul 2025 16:53:24 +0000
Subject: [PATCH 11/18] add check statement
---
mlir/test/Dialect/XeGPU/layout.mlir | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index ef51dfbbfd574..017dacc8d629a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,12 +35,16 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}
gpu.func @convert_layout(%a: vector<32x64xf16>) {
+ // CHECK: xegpu.convert_layout
+ // CHECK-SAME: <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
+ // CHECK: xegpu.convert_layout
+ // CHECK-SAME: <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
From cad98ad4eae4d52dbaa124413244edd45046c45d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 18 Jul 2025 14:58:29 +0000
Subject: [PATCH 12/18] add convert layout test for blocking
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 9 ++++-
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 38 +++++++++++++++++++
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 1 -
3 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 53f2bf316e370..40a6678ace58f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -81,8 +81,13 @@ struct ConvertLayoutOpPattern
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
- xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData();
- xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData();
+ xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
+ xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
+ if (!input_layout.getInstData() || !target_layout.getInstData())
+ return rewriter.notifyMatchFailure(op, "Skip non-target ConvertLayoutOp.");
+
+ input_layout = input_layout.dropInstData();
+ target_layout = target_layout.dropInstData();
auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
rewriter.replaceOp(op, newOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index e820e13f09f64..2ce4c10f1ac68 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -500,3 +500,41 @@ gpu.module @test_kernel {
gpu.return
}
}
+
+// -----
+#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
+#c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+
+gpu.module @test_kernel {
+ //CHECK-LABEL: gpu.func @convert_layout
+ //CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32>
+ //CHECK: [[c8:%.+]] = arith.constant 8 : index
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
+ //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+ //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+ //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+
+ gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) {
+ %c0 = arith.constant 0 : index
+ %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
+ %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
+ xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+ gpu.return
+ }
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 6c688f4db6dec..949b6205386ac 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -207,5 +207,4 @@ gpu.module @test_round_robin_assignment {
target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
gpu.return
}
-
}
>From 4568657d77d4529358b4e783dd3af8c3f06fba30 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 18 Jul 2025 15:17:39 +0000
Subject: [PATCH 13/18] fix format
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 40a6678ace58f..d8e8aed3cf8de 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -84,7 +84,7 @@ struct ConvertLayoutOpPattern
xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
if (!input_layout.getInstData() || !target_layout.getInstData())
- return rewriter.notifyMatchFailure(op, "Skip non-target ConvertLayoutOp.");
+ return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
input_layout = input_layout.dropInstData();
target_layout = target_layout.dropInstData();
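For reference, the complete pattern body is short. The sketch below reassembles it from the fragments quoted in this mail; the trailing create-and-replace tail is inferred from the rewriter.replaceOp(op, newOp); line earlier in this message, so treat it as an approximation of the committed code rather than a verbatim copy:

struct ConvertLayoutOpPattern
    : public OpRewritePattern<xegpu::ConvertLayoutOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
                                PatternRewriter &rewriter) const override {
    xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
    xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
    // Only ops whose layouts carry inst_data are handled by this pattern.
    if (!input_layout.getInstData() || !target_layout.getInstData())
      return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
    // Producer and consumer ops already tile values into inst_data-sized
    // pieces, so this op only needs the inst_data fields stripped.
    input_layout = input_layout.dropInstData();
    target_layout = target_layout.dropInstData();
    auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
        op.getLoc(), op.getType(), op.getSource(), input_layout,
        target_layout);
    rewriter.replaceOp(op, newOp);
    return success();
  }
};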
>From ff72cb584185ca6f1de87fe257ac0a2c3251c4fe Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 18 Jul 2025 17:53:12 +0000
Subject: [PATCH 14/18] update comments
---
.../Transforms/XeGPUWgToSgDistribute.cpp | 26 +++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 04e0a25929a6f..f1f4fedf846ad 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -392,6 +392,32 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};
+// clang-format off
+// Pattern for lowering ConvertLayoutOp based on sg_layout and sg_data.
+// If input_layout and target_layout have identical sg_layout and sg_data,
+// the op is rewritten to a subgroup-level ConvertLayoutOp with these fields
+// dropped. For example:
+// #a = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>
+// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// becomes:
+// #a = #xegpu.layout<inst_data = [16, 16]>
+// #b = #xegpu.layout<inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32>
+// (vector<16x16xf32> is determined by sg_data = [16, 16])
+//
+// If sg_layout or sg_data differ, SLM is used to redistribute data across subgroups.
+// For example:
+// #a = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 16], inst_data = [16, 16]>
+// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 32], inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// is lowered to:
+// #a = #xegpu.layout<inst_data = [16, 16]>
+// #b = #xegpu.layout<inst_data = [8, 16]>
+// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, metrix_desc<32x64xf32>
+// %d = load_matrix %slm <{layout_result_0 = #a}> : metrix_desc<32x64xf32> -> vector<16x32xf32>
+// xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
+// clang-format on
struct WgToSgConvertLayoutOp
: public OpConversionPattern<xegpu::ConvertLayoutOp> {
using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
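Piecing together the hunks quoted above and below, the optimal-case body of this pattern looks roughly as follows. The OneToNOpAdaptor signature and the final replaceOpWithMultiple call are assumptions about the surrounding 1:N conversion plumbing (not quoted in full in this mail), and the order comparison only appears after a later patch in this series:

struct WgToSgConvertLayoutOp
    : public OpConversionPattern<xegpu::ConvertLayoutOp> {
  using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
  LogicalResult
  matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    xegpu::LayoutAttr input = op.getInputLayoutAttr();
    xegpu::LayoutAttr target = op.getTargetLayoutAttr();
    // Optimal case only: identical subgroup-level fields, so no SLM
    // round trip is required.
    if (input.getSgLayout() != target.getSgLayout() ||
        input.getSgData() != target.getSgData() ||
        input.getOrder() != target.getOrder())
      return failure();
    input = input.dropSgLayoutAndData();
    target = target.dropSgLayoutAndData();
    SmallVector<Value> newOps(adaptor.getSource());
    if (input && target) {
      // Keep a subgroup-level ConvertLayoutOp for the remaining fields,
      // e.g., inst_data.
      for (auto [i, src] : llvm::enumerate(adaptor.getSource()))
        newOps[i] = rewriter.create<xegpu::ConvertLayoutOp>(
            op.getLoc(), src.getType(), src, input, target);
    }
    rewriter.replaceOpWithMultiple(op, {newOps});
    return success();
  }
};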
>From cf59d7a2c6c9adecdcb521cef4386ff4f8ff7bf3 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 18 Jul 2025 18:17:15 +0000
Subject: [PATCH 15/18] address comments
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 5 +++++
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 2 +-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index d8e8aed3cf8de..6d8fd4509794c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -76,6 +76,11 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
+// This pattern rewrites ConvertLayoutOp by removing the inst_data field
+// from the layout attributes. Since the surrounding extract_strided_slice
+// and insert_strided_slice operations already reconstruct the original
+// value shape and partition it into instruction-sized tiles, the lowering
+// simply drops the inst_data field from ConvertLayoutOp.
struct ConvertLayoutOpPattern
: public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern::OpRewritePattern;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index f1f4fedf846ad..2c0587024f200 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -445,8 +445,8 @@ struct WgToSgConvertLayoutOp
target = target.dropSgLayoutAndData();
SmallVector<Value> newOps(adaptor.getSource());
-
if (input && target) {
+ // Keep the ConvertLayoutOp for the remaining fields, e.g., inst_data.
for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
op.getLoc(), src.getType(), src, input, target);
>From 8e79477bc7d435a3b9a9721b49e694a02331694e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 15:43:33 +0000
Subject: [PATCH 16/18] fix comments
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 9 ++++-----
.../Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 5 ++++-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 6d8fd4509794c..4656f112958b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -76,11 +76,10 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
-// This pattern rewrites ConvertLayoutOp by removing the inst_data field
-// from the layout attributes. Since the surrounding extract_strided_slice
-// and insert_strided_slice operations already reconstruct the original
-// value shape and partition it into instruction-sized tiles, the lowering
-// simply drops the inst_data field from ConvertLayoutOp.
+// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
+// layout attributes. Since producer and consumer operations each handle data
+// partitioning based on their own inst_data while preserving the original
+// input and output shapes, ConvertLayoutOp itself need not manage inst_data.
struct ConvertLayoutOpPattern
: public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern::OpRewritePattern;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 2c0587024f200..519d48382b8e1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -433,12 +433,15 @@ struct WgToSgConvertLayoutOp
DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
DenseI32ArrayAttr inputSgData = input.getSgData();
+ DenseI32ArrayAttr inputOrder = input.getOrder();
DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
DenseI32ArrayAttr targetSgData = target.getSgData();
+ DenseI32ArrayAttr targetOrder = target.getOrder();
// TODO: currently we only support the optimal case, where input and
// output have the same sg_layout and sg_data, so SLM is not involved.
- if (inputSgLayout != targetSgLayout || inputSgData != targetSgData)
+ if (inputSgLayout != targetSgLayout || inputSgData != targetSgData ||
+ inputOrder != targetOrder)
return failure();
input = input.dropSgLayoutAndData();
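A side note on why plain ==/!= works on the fields above: MLIR attributes are uniqued in their MLIRContext, so two DenseI32ArrayAttr handles with equal contents share the same storage, and a null handle (field absent) never compares equal to a present one. A minimal sketch of the property, assuming a Builder b is in scope:

  // Uniquing makes content equality coincide with pointer equality.
  DenseI32ArrayAttr x = b.getDenseI32ArrayAttr({2, 2});
  DenseI32ArrayAttr y = b.getDenseI32ArrayAttr({2, 2});
  DenseI32ArrayAttr none;  // field not set
  assert(x == y);          // same uniqued storage
  assert(none != x);       // absent vs. present differ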
>From 727fc0b3bddf81a1840075380164c8ada3e06358 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 22 Jul 2025 16:07:42 +0000
Subject: [PATCH 17/18] add unit test for 3D convert layout
---
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 23 +++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 2ce4c10f1ac68..d986e5bd1cfb4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -538,3 +538,26 @@ gpu.module @test_kernel {
gpu.return
}
}
+
+// -----
+
+#lb = #xegpu.layout<inst_data = [8, 32, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
+#b = #xegpu.layout<inst_data = [8, 16, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
+
+gpu.module @test_kernel {
+ //CHECK: gpu.func @convert_layout([[arg0:%.+]]: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+ //CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<8x32x2xf16>
+ //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+ //CHECK: [[m1:%.+]] = math.exp [[e1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
+ //CHECK: [[r1:%.+]] = vector.insert_strided_slice [[m1]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 16, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+ //CHECK: [[m2:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
+ //CHECK: [[r2:%.+]] = vector.insert_strided_slice [[m2]], [[r1]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: gpu.return [[r2]] : vector<8x32x2xf16>
+
+ gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+ %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
+ %e = math.exp %b {layout_result_0 = #b} : vector<8x32x2xf16>
+ gpu.return %e : vector<8x32x2xf16>
+ }
+}
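The offsets in the CHECK lines above ([0, 0, 0] and [0, 16, 0]) follow mechanically from walking the value shape in inst_data-sized steps. The standalone toy program below (not part of the patch; purely illustrative) reproduces that enumeration for vector<8x32x2xf16> with inst_data = [8, 16, 2]:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> shape = {8, 32, 2};  // vector<8x32x2xf16>
  std::vector<int> inst = {8, 16, 2};   // inst_data from #b
  std::vector<int> off(shape.size(), 0);
  bool done = false;
  while (!done) {
    std::printf("offsets = [%d, %d, %d], sizes = [%d, %d, %d]\n",
                off[0], off[1], off[2], inst[0], inst[1], inst[2]);
    // Advance like a mixed-radix counter, innermost dimension first.
    int d = static_cast<int>(shape.size()) - 1;
    while (d >= 0) {
      off[d] += inst[d];
      if (off[d] < shape[d])
        break;
      off[d] = 0;
      --d;
    }
    done = (d < 0);
  }
  return 0;
}

Running it prints the two tiles that the blocking pass extracts, matching the extract_strided_slice offsets verified above.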
>From e1b9d1ebbd33c600b2f100fb7418d93523dbee92 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 23 Jul 2025 11:06:08 -0500
Subject: [PATCH 18/18] Update
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
Co-authored-by: Adam Siemieniuk <adam.siemieniuk at intel.com>
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 519d48382b8e1..64428e01550b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -414,8 +414,8 @@ struct WgToSgElementwiseOp : public ConversionPattern {
// is lowered to:
// #a = #xegpu.layout<inst_data = [16, 16]>
// #b = #xegpu.layout<inst_data = [8, 16]>
-// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, metrix_desc<32x64xf32>
-// %d = load_matrix %slm <{layout_result_0 = #a}> : metrix_desc<32x64xf32> -> vector<16x32xf32>
+// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16xf32>, matrix_desc<32x64xf32>
+// %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32>
// xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
// clang-format on
struct WgToSgConvertLayoutOp