[Mlir-commits] [mlir] 317dae1 - [mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp (#146176)
llvmlistbot at llvm.org
Wed Jul 23 09:35:44 PDT 2025
Author: Chao Chen
Date: 2025-07-23T11:35:40-05:00
New Revision: 317dae1a7e5fb81038177d8c58ee1e376d50ea5c
URL: https://github.com/llvm/llvm-project/commit/317dae1a7e5fb81038177d8c58ee1e376d50ea5c
DIFF: https://github.com/llvm/llvm-project/commit/317dae1a7e5fb81038177d8c58ee1e376d50ea5c.diff
LOG: [mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp (#146176)
This PR adds an initial skeleton implementation for lowering
ConvertLayoutOp. It currently only supports cases where SLM (shared
local memory) is not needed.
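
As a rough sketch of the supported case (identical sg_layout and sg_data on both
sides; adapted from the pattern comments and tests in this diff), a workgroup-level

    #a = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>
    #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>
    xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>

is distributed to a subgroup-level op with sg_layout and sg_data dropped:

    #a = #xegpu.layout<inst_data = [16, 16]>
    #b = #xegpu.layout<inst_data = [8, 16]>
    xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32>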
---------
Co-authored-by: Adam Siemieniuk <adam.siemieniuk at intel.com>
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/invalid.mlir
mlir/test/Dialect/XeGPU/layout.mlir
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index e9f8437d7c102..91d6b2a5ead9b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1010,21 +1010,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
let summary = "Convert the layout of the input operand";
let description = [{
- `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
- the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
- as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
- lowered to WI level because that is the end result of all distributions.
+    `convert_layout` redistributes data across subgroups and/or work-items from the `input_layout` to
+ the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
+ scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
+ the IR is lowered to WI level because that is the end result of all distributions.
}];
- let arguments = (ins XeGPU_Vector2DType: $source,
- XeGPU_LayoutAttr: $srcMap,
- XeGPU_LayoutAttr: $resMap
- );
- let results = (outs XeGPU_Vector2DType: $result);
+ let arguments = (ins XeGPU_VectorType: $source,
+ XeGPU_LayoutAttr: $input_layout,
+ XeGPU_LayoutAttr: $target_layout);
+ let results = (outs XeGPU_VectorType: $result);
let assemblyFormat = [{
- $source attr-dict `:` type($source)
+ $source prop-dict attr-dict `:` type($source)
}];
+ let hasFolder = 1;
let hasVerifier = 1;
+ let hasCanonicalizer = 1;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
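
With the new prop-dict assembly format, the op is written as, for example (taken
from the updated layout.mlir test below):

    %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                                   target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>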
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1f4e817dc549c..20916ae9ef830 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -22,7 +22,7 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_Vector2DType: FixedVectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
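
The relaxed XeGPU_VectorType constraint also admits non-2D operands; for example,
the 3D case exercised in the xegpu-blocking.mlir test below:

    %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>

where #lb and #b are the layout aliases defined in that test.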
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e0046d2c9a37a..704deeaa1f26b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -786,32 +786,55 @@ LogicalResult DpasOp::verify() {
// XeGPU_ConvertLayoutOp
//===----------------------------------------------------------------------===//
LogicalResult ConvertLayoutOp::verify() {
- auto srcMap = getSrcMapAttr();
- auto resMap = getResMapAttr();
- if (!srcMap)
- return emitOpError("expected srcMap.");
- if (!resMap)
- return emitOpError("expected resMap.");
-
- if (srcMap == resMap)
-    return emitOpError("expected different srcMap and resMap.");
-
- // both srcMap and resMap should be WgLayout or SgLayout at the same time.
- if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
- (!srcMap.isSgLayout() || !resMap.isSgLayout()))
- return emitOpError(
- "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
+ auto srcLayout = getInputLayout();
+ auto resLayout = getTargetLayout();
+ if (!srcLayout)
+ return emitOpError("expected input layout.");
+ if (!resLayout)
+ return emitOpError("expected target layout.");
+
+ // both input and target layouts should be WgLayout or SgLayout at the same
+ // time.
+ if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
+ (!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
+ return emitOpError("expected input layout and target layout be WgLayout or "
+ "SgLayout at the same time.");
auto shape = getSource().getType().getShape();
- if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
- return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+ if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
+ return emitOpError(
+ "invalid input layout, data cannot be evenly distributed.");
- if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
- return emitOpError("invalid resMap, data cannot be evenly distributed.");
+ if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
+ return emitOpError(
+ "invalid target layout, data cannot be evenly distributed.");
return mlir::success();
}
+OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
+ if (getInputLayout() == getTargetLayout())
+ return getSource();
+ return {};
+}
+
+struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+ using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
+ if (op.getInputLayout() == op.getTargetLayout()) {
+ rewriter.replaceOp(op, op.getSource());
+ return success();
+ }
+ return failure();
+ }
+};
+
+void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
+ MLIRContext *context) {
+ patterns.add<FoldConvertLayoutOp>(context);
+}
+
} // namespace xegpu
} // namespace mlir
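
The new folder (and the matching canonicalization pattern) removes no-op
conversions; a minimal sketch, assuming some layout alias #l:

    // Folds to plain %a because input_layout == target_layout.
    %r = xegpu.convert_layout %a <{input_layout = #l, target_layout = #l}> : vector<32x64xf16>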
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 0c55588eda7a3..4656f112958b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -76,6 +76,29 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}
+// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
+// layout attributes. Since producer and consumer operations handle data
+// partitioning based on their own inst_data while maintaining the original
+// input and output shapes, ConvertLayoutOp does not need to manage inst_data.
+struct ConvertLayoutOpPattern
+ : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+ PatternRewriter &rewriter) const override {
+ xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
+ xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
+ if (!input_layout.getInstData() || !target_layout.getInstData())
+ return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
+
+ input_layout = input_layout.dropInstData();
+ target_layout = target_layout.dropInstData();
+ auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
+ op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
+ rewriter.replaceOp(op, newOp);
+ return success();
+ }
+};
+
//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
@@ -331,6 +354,7 @@ void XeGPUBlockingPass::runOnOperation() {
});
RewritePatternSet patterns(ctx);
+ patterns.add<ConvertLayoutOpPattern>(ctx);
vector::UnrollVectorOptions vectorOptions;
vectorOptions.setNativeShapeFn(options.nativeShape);
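
For example (a sketch matching the new convert_layout test in xegpu-blocking.mlir
below, with %a standing in for the loaded value), a subgroup-level conversion such as

    xegpu.convert_layout %a <{input_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>,
                              target_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>

is rewritten with inst_data dropped from both layouts:

    xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>,
                              target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>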
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index b0f5bcaf90864..ef52323a9f46b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
// Calculate offset for each subgroup
- SmallVector<OpFoldResult>
+ static SmallVector<OpFoldResult>
calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
const SmallVector<OpFoldResult> &originalOffsets,
const SmallVector<Value> &localOffset,
const SmallVector<int64_t> &distUnitBaseAddr,
- const SmallVector<int64_t> &distUnitShape) const {
+ const SmallVector<int64_t> &distUnitShape) {
assert(localOffset.size() == distUnitBaseAddr.size() &&
"localOffset and distUnitBaseAddr must have the same rank");
@@ -466,6 +466,75 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};
+// clang-format off
+// Pattern for lowering ConvertLayoutOp based on sg_layout and sg_data.
+// If input_layout and target_layout have identical sg_layout and sg_data,
+// the op is rewritten to a subgroup-level ConvertLayoutOp with these fields
+// dropped. For example:
+// #a = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>
+// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// becomes:
+// #a = #xegpu.layout<inst_data = [16, 16]>
+// #b = #xegpu.layout<inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32>
+// (vector<16x16xf32> is determined by sg_data = [16, 16])
+//
+// If sg_layout or sg_data differ, SLM is used to redistribute data across subgroups.
+// For example:
+// #a = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 16], inst_data = [16, 16]>
+// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 32], inst_data = [8, 16]>
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// is lowered to:
+// #a = #xegpu.layout<inst_data = [16, 16]>
+// #b = #xegpu.layout<inst_data = [8, 16]>
+// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, matrix_desc<32x64xf32>
+// %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32>
+// xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
+// clang-format on
+struct WgToSgConvertLayoutOp
+ : public OpConversionPattern<xegpu::ConvertLayoutOp> {
+ using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::LayoutAttr input = op.getInputLayout();
+ xegpu::LayoutAttr target = op.getTargetLayout();
+
+ if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
+ return rewriter.notifyMatchFailure(
+ op, "Input and target layouts must have subgroup layout");
+
+ DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
+ DenseI32ArrayAttr inputSgData = input.getSgData();
+ DenseI32ArrayAttr inputOrder = input.getOrder();
+ DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
+ DenseI32ArrayAttr targetSgData = target.getSgData();
+ DenseI32ArrayAttr targetOrder = target.getOrder();
+
+    // TODO: currently we only support the optimal case, where the input and
+    // output have the same sg_layout and sg_data, so SLM is not involved.
+ if (inputSgLayout != targetSgLayout || inputSgData != targetSgData ||
+ inputOrder != targetOrder)
+ return failure();
+
+ input = input.dropSgLayoutAndData();
+ target = target.dropSgLayoutAndData();
+
+ SmallVector<Value> newOps(adaptor.getSource());
+ if (input && target) {
+      // keep the ConvertLayoutOp for the remaining fields, e.g., inst_data.
+ for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
+ auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+ op.getLoc(), src.getType(), src, input, target);
+ newOps[i] = newOp;
+ }
+ }
+ rewriter.replaceOpWithMultiple(op, {newOps});
+ return success();
+ }
+};
+
// Handles UnrealizedConversionCastOp generated during
// SCFStructuralTypeConversions (step 1). This op may appear as either a
// target or source materialization for Vector values, e.g.:
@@ -550,7 +619,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
UnrealizedConversionCastOpPattern, WgToSgElementwiseOp,
- WgToSgVectorBroadcastOp>(patterns.getContext());
+ WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp>(
+ patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
@@ -662,6 +732,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(xegpu::getLayoutAttr(op.getResult()));
});
+ target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
+ [=](xegpu::ConvertLayoutOp op) -> bool {
+ return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
+ });
+
target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
[=](Operation *op) -> std::optional<bool> {
// Only handle elementwise mappable ops
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 66a2f03da71b2..21ba96eaeb0f8 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
+ // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+ return convertOp.getTargetLayoutAttr();
+
// for LoadNdOp, the layout is stored in the tensor descriptor
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
@@ -137,7 +141,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
auto parentOp = arg.getOwner()->getParentOp();
if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
- return getLayoutAttr(tiedInit->get());
+ if (tiedInit)
+ return getLayoutAttr(tiedInit->get());
}
}
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 516c2158cb0f8..0160bfee07bf2 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -548,19 +548,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
return
}
-// -----
-func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
-  // expected-error @+1 {{expected different srcMap and resMap}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
- gpu.return
-}
-
// -----
func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
-  // expected-error @+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  // expected-error @+1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}}
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 7f3ebec225cdf..017dacc8d629a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,18 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}
gpu.func @convert_layout(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ // CHECK: xegpu.convert_layout
+ // CHECK-SAME: <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ // CHECK: xegpu.convert_layout
+ // CHECK-SAME: <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index e820e13f09f64..d986e5bd1cfb4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -500,3 +500,64 @@ gpu.module @test_kernel {
gpu.return
}
}
+
+// -----
+#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
+#c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+
+gpu.module @test_kernel {
+ //CHECK-LABEL: gpu.func @convert_layout
+ //CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32>
+ //CHECK: [[c8:%.+]] = arith.constant 8 : index
+ //CHECK: [[c0:%.+]] = arith.constant 0 : index
+ //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
+ //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+ //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+ //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+
+ gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) {
+ %c0 = arith.constant 0 : index
+ %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
+ %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
+ xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+ gpu.return
+ }
+}
+
+// -----
+
+#lb = #xegpu.layout<inst_data = [8, 32, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
+#b = #xegpu.layout<inst_data = [8, 16, 2], lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>
+
+gpu.module @test_kernel {
+ //CHECK: gpu.func @convert_layout([[arg0:%.+]]: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+ //CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<8x32x2xf16>
+ //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+ //CHECK: [[m1:%.+]] = math.exp [[e1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
+ //CHECK: [[r1:%.+]] = vector.insert_strided_slice [[m1]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 16, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+ //CHECK: [[m2:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [8, 1, 2]>} : vector<8x16x2xf16>
+ //CHECK: [[r2:%.+]] = vector.insert_strided_slice [[m2]], [[r1]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+ //CHECK: gpu.return [[r2]] : vector<8x32x2xf16>
+
+ gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+ %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
+ %e = math.exp %b {layout_result_0 = #b} : vector<8x32x2xf16>
+ gpu.return %e : vector<8x32x2xf16>
+ }
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 8a880068aab33..d67bdb487d8bf 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -115,7 +115,7 @@ gpu.module @test_round_robin_assignment {
// CHECK-SAME-COUNT-3: {layout_result_0 = #xegpu.layout<lane_layout = [2, 1], lane_data = [1, 1]>}
// CHECK-SAME-COUNT-3: : vector<2x1xf32> to vector<2x4xf32>
// CHECK-NOT: vector.broadcast
- %broadcast = vector.broadcast %load
+ %broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [2, 4], lane_layout = [2, 1], lane_data = [1, 1]>}
: vector<24x1xf32> to vector<24x8xf32>
gpu.return
@@ -215,4 +215,14 @@ gpu.module @test_round_robin_assignment {
xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
gpu.return
}
+
+ gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
+ %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
+ //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
+ //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+ %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
+ gpu.return
+ }
}