[Mlir-commits] [mlir] [mlir][xegpu] Add `XeGPUSgToWiDistributeExperimental` pass. (PR #177492)
Charitha Saumya
llvmlistbot at llvm.org
Thu Jan 29 09:45:56 PST 2026
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/177492
>From a258368e1ece17eb3d378aa356feef1530dccc01 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 12 Jan 2026 22:54:17 +0000
Subject: [PATCH 01/18] save work
---
mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3ff7805263f0e..4a213cad69e68 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -114,4 +114,14 @@ def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
"vector::VectorDialect"];
}
+def XeGPUSgToWiDistributeExperimental : Pass<"xegpu-sg-to-wi-distribute-experimental"> {
+ let summary = "Distribute XeGPU ops to work items";
+ let description = [{
+ The pass distributes subgroup level XeGPU ops to work item level XeGPU ops.
+ }];
+ let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+ "vector::VectorDialect", "index::IndexDialect"];
+}
+
+
#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
>From 1c0d3d6180ef14ce4e62cffdf28e80f5984cb481 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 14 Jan 2026 17:33:38 +0000
Subject: [PATCH 02/18] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 4 ++
.../Dialect/XeGPU/Transforms/CMakeLists.txt | 1 +
.../XeGPUSgToWiDistributeExperimental.cpp | 59 +++++++++++++++++++
3 files changed, 64 insertions(+)
create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1776a209d0bf1..8f69b9e75f374 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -70,6 +70,10 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU workgroup to subgroup distribution into
/// `patterns`.
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU subgroup to work-item distribution into
+/// `patterns`.
+void populateXeGPUSgToWiDistributeExperimentalPatterns(
+ RewritePatternSet &patterns);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 29b645feab2c6..9bdde30ca5b89 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUBlocking.cpp
XeGPUFoldAliasOps.cpp
+ XeGPUSgToWiDistributeExperimental.cpp
XeGPUSubgroupDistribute.cpp
XeGPUUnroll.cpp
XeGPUWgToSgDistribute.cpp
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
new file mode 100644
index 0000000000000..d1ac34c69d4b5
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -0,0 +1,59 @@
+//===- XeGPUSgToWiDistributeExperimental.cpp - XeGPU SG to WI Pass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUSGTOWIDISTRIBUTEEXPERIMENTAL
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+
+struct XeGPUSgToWiDistributeExperimentalPass
+ : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
+ XeGPUSgToWiDistributeExperimentalPass> {
+ void runOnOperation() override;
+};
+
+} // namespace
+
+void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+ // Recover layouts.
+ Operation *op = getOperation();
+ if (!xegpu::recoverTemporaryLayouts(op)) {
+ signalPassFailure();
+ return;
+ }
+
+ // Define conversion target
+ ConversionTarget target(getContext());
+ target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
+ vector::VectorDialect>();
+ target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
+ [](Operation *op) { return true; });
+
+ // Define type converter
+ TypeConverter typeConverter;
+ typeConverter.addConversion([](Type type) { return type; });
+}
+
+void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
+ RewritePatternSet &patterns) {
+ // TODO: Implement pattern population logic
+}
>From 9fb97fe24a11c92415a691d6c4a1fbeec0ccc8e5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 14 Jan 2026 22:21:08 +0000
Subject: [PATCH 03/18] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 3 +-
.../XeGPUSgToWiDistributeExperimental.cpp | 27 +++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 20 +++++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 54 +++++++++++++++++++
4 files changed, 101 insertions(+), 3 deletions(-)
create mode 100644 mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index e4c8e2356b191..898af0ec14738 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H
#include "mlir/IR/Operation.h"
+#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/LogicalResult.h"
@@ -73,7 +74,7 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU subgroup to work-item distribution into
/// `patterns`.
void populateXeGPUSgToWiDistributeExperimentalPatterns(
- RewritePatternSet &patterns);
+ RewritePatternSet &patterns, TypeConverter &typeConverter);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index d1ac34c69d4b5..2f73b08aa45ac 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -13,6 +13,8 @@
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/LogicalResult.h"
namespace mlir {
namespace xegpu {
@@ -22,9 +24,30 @@ namespace xegpu {
} // namespace mlir
using namespace mlir;
+using namespace mlir::xegpu;
namespace {
+struct CreateNdDescOpPattern
+ : public OpConversionPattern<xegpu::CreateNdDescOp> {
+ using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto resultType = op.getType();
+ // If no layout, nothing to do.
+ if (!resultType.getLayout())
+ return failure();
+
+ auto newOp = xegpu::CreateNdDescOp::create(
+ rewriter, op.getLoc(), resultType.dropLayouts(), op->getOperands(),
+ op->getAttrs());
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -54,6 +77,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
}
void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
- RewritePatternSet &patterns) {
- // TODO: Implement pattern population logic
+ RewritePatternSet &patterns, TypeConverter &typeConverter) {
+ patterns.add<CreateNdDescOpPattern>(typeConverter, patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
new file mode 100644
index 0000000000000..2d6e7015e3b39
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -0,0 +1,20 @@
+
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
+// --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+
+gpu.module @xevm_module {
+gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+
+gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape : [256, 256], strides : [256, 1] : ui64
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+
+}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 1a1520dfa975d..fcae5a3a5dd06 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -247,6 +247,59 @@ struct TestXeGPUSGDistribute
}
};
+struct TestXeGPUSgToWiDistributeExperimental
+ : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
+ OperationPass<gpu::GPUModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+ TestXeGPUSgToWiDistributeExperimental)
+
+ StringRef getArgument() const final {
+ return "test-xegpu-sg-to-wi-distribute-experimental";
+ }
+
+ StringRef getDescription() const final {
+ return "Test the experimental implementation of XeGPU Subgroup to "
+ "Work-item Distribution";
+ }
+
+ void getDependentDialects(::mlir::DialectRegistry ®istry) const override {
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<vector::VectorDialect>();
+ registry.insert<index::IndexDialect>();
+ registry.insert<gpu::GPUDialect>();
+ }
+
+ TestXeGPUSgToWiDistributeExperimental() = default;
+ TestXeGPUSgToWiDistributeExperimental(
+ const TestXeGPUSgToWiDistributeExperimental &pass) = default;
+
+ void runOnOperation() override {
+ MLIRContext *ctx = &getContext();
+
+ TypeConverter typeConverter;
+ // After distribution, there are no layouts associated with the tensor_desc
+ // types.
+ typeConverter.addConversion(
+ [](xegpu::TensorDescType type) { return type.dropLayouts(); });
+ typeConverter.addConversion([](Type type) { return type; });
+
+ ConversionTarget target(*ctx);
+ // CreateNdDescOp is legal only if its result type has no layout attribute.
+ target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+ [&](xegpu::CreateNdDescOp op) {
+ return !op.getType().getLayoutAttr();
+ });
+ RewritePatternSet patterns(ctx);
+ xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
+ typeConverter);
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+
+ (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+ }
+};
+
struct TestXeGPUMoveFuncBodyToWarpOp
: public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
OperationPass<gpu::GPUModuleOp>> {
@@ -341,6 +394,7 @@ void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
PassRegistration<TestXeGPULayoutInterface>();
PassRegistration<TestXeGPUSGDistribute>();
+ PassRegistration<TestXeGPUSgToWiDistributeExperimental>();
PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
}
} // namespace test
>From 2f818376693291202f89a3e4ae36042c9d3173bb Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 15 Jan 2026 21:12:55 +0000
Subject: [PATCH 04/18] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 69 ++++++++++++++-----
.../XeGPU/sg-to-wi-experimental-unit.mlir | 12 +++-
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 53 ++++++++++++--
3 files changed, 109 insertions(+), 25 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 2f73b08aa45ac..25df0f341093b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -24,7 +24,6 @@ namespace xegpu {
} // namespace mlir
using namespace mlir;
-using namespace mlir::xegpu;
namespace {
@@ -35,19 +34,50 @@ struct CreateNdDescOpPattern
LogicalResult
matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto resultType = op.getType();
+ xegpu::TensorDescType resultType = op.getType();
// If no layout, nothing to do.
if (!resultType.getLayout())
return failure();
auto newOp = xegpu::CreateNdDescOp::create(
- rewriter, op.getLoc(), resultType.dropLayouts(), op->getOperands(),
+ rewriter, op.getLoc(), resultType.dropLayouts(), op.getOperands(),
op->getAttrs());
rewriter.replaceOp(op, newOp.getResult());
return success();
}
};
+struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
+ using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::LoadNdOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+ // If no layout, nothing to do.
+ if (!layout)
+ return failure();
+ // Check if the layout attached to the tensor descriptor is the same as the
+ // anchor layout. Otherwise, this is a conflict.
+ if (op.getTensorDescType().getLayout() != layout)
+ return rewriter.notifyMatchFailure(
+ op, "conflicting layout attributes on tensor descriptor and anchor");
+ auto distributedVectorTypeOrFailure =
+ xegpu::getDistributedVectorType(op.getTensorDescType());
+ if (failed(distributedVectorTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute distributed vector type from the layout");
+ llvm::errs() << "adaptor tensor desc: " << adaptor.getTensorDesc() << "\n";
+ auto newOp = xegpu::LoadNdOp::create(
+ rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
+ adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
+ op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
+ op.getL3HintAttr(), /**layout**/ nullptr);
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -57,26 +87,27 @@ struct XeGPUSgToWiDistributeExperimentalPass
} // namespace
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
- // Recover layouts.
- Operation *op = getOperation();
- if (!xegpu::recoverTemporaryLayouts(op)) {
- signalPassFailure();
- return;
- }
+ // // Recover layouts.
+ // Operation *op = getOperation();
+ // if (!xegpu::recoverTemporaryLayouts(op)) {
+ // signalPassFailure();
+ // return;
+ // }
- // Define conversion target
- ConversionTarget target(getContext());
- target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
- vector::VectorDialect>();
- target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
- [](Operation *op) { return true; });
+ // // Define conversion target
+ // ConversionTarget target(getContext());
+ // target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
+ // vector::VectorDialect>();
+ // target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
+ // [](Operation *op) { return true; });
- // Define type converter
- TypeConverter typeConverter;
- typeConverter.addConversion([](Type type) { return type; });
+ // // Define type converter
+ // TypeConverter typeConverter;
+ // typeConverter.addConversion([](Type type) { return type; });
}
void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
RewritePatternSet &patterns, TypeConverter &typeConverter) {
- patterns.add<CreateNdDescOpPattern>(typeConverter, patterns.getContext());
+ patterns.add<CreateNdDescOpPattern, LoadNdOpPattern>(typeConverter,
+ patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 2d6e7015e3b39..df21e4a05bfca 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -5,16 +5,24 @@
gpu.module @xevm_module {
gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape : [256, 256], strides : [256, 1] : ui64
+ %0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
+gpu.func @load_nd() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ gpu.return
+}
+
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index fcae5a3a5dd06..093e37153cd74 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -12,10 +12,13 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include <optional>
using namespace mlir;
using namespace mlir::xegpu;
@@ -279,11 +282,45 @@ struct TestXeGPUSgToWiDistributeExperimental
MLIRContext *ctx = &getContext();
TypeConverter typeConverter;
- // After distribution, there are no layouts associated with the tensor_desc
- // types.
- typeConverter.addConversion(
- [](xegpu::TensorDescType type) { return type.dropLayouts(); });
typeConverter.addConversion([](Type type) { return type; });
+ typeConverter.addConversion([](TensorDescType type) -> Type {
+ if (type.getLayoutAttr()) {
+ return type.dropLayouts();
+ }
+ return type;
+ });
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+ .getResult(0);
+ };
+ // Define a vector materialization cast. If the input and output have same
+ // number of elements, perform a shape cast. Otherwise, use
+ // UnrealizedConversionCastOp to handle the conversion.
+ auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
+ ValueRange inputs,
+ Location loc) -> Value {
+ if (inputs.size() != 1)
+ return {};
+ auto input = inputs.front();
+ auto inputVecTy = dyn_cast<VectorType>(input.getType());
+ auto targetVecTy = dyn_cast<VectorType>(type);
+ if (inputVecTy && targetVecTy) {
+ if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
+ return vector::ShapeCastOp::create(builder, loc, targetVecTy, input)
+ .getResult();
+ }
+ return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+ .getResult(0);
+ }
+
+ return {};
+ };
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+ typeConverter.addSourceMaterialization(vectorMaterializationCast);
+ typeConverter.addTargetMaterialization(vectorMaterializationCast);
ConversionTarget target(*ctx);
// CreateNdDescOp is legal only if its result type has no layout attribute.
@@ -291,6 +328,14 @@ struct TestXeGPUSgToWiDistributeExperimental
[&](xegpu::CreateNdDescOp op) {
return !op.getType().getLayoutAttr();
});
+ // Any anchor XeGPU op is legal only if it has no anchor layout.
+ target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
+ auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
+ if (!anchorOp)
+ return true;
+ return !anchorOp.getAnchorLayout();
+ });
+
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
typeConverter);
>From 8784a2628f417fac71f19386ad6f2926055caa99 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 15 Jan 2026 23:23:40 +0000
Subject: [PATCH 05/18] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 39 ++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 11 ++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 104 ++++++++++++++----
3 files changed, 127 insertions(+), 27 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 25df0f341093b..31e8b070b0e8b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -15,6 +15,7 @@
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
namespace mlir {
namespace xegpu {
@@ -67,7 +68,6 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
if (failed(distributedVectorTypeOrFailure))
return rewriter.notifyMatchFailure(
op, "unable to compute distributed vector type from the layout");
- llvm::errs() << "adaptor tensor desc: " << adaptor.getTensorDesc() << "\n";
auto newOp = xegpu::LoadNdOp::create(
rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
@@ -78,6 +78,39 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
}
};
+struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
+ using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::StoreNdOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+ // If no layout, nothing to do.
+ if (!layout)
+ return failure();
+ // Check that the tensor descriptor layout and the value layout both match
+ // the anchor layout. Otherwise, this is a conflict.
+ if (op.getTensorDescType().getLayout() != layout)
+ return rewriter.notifyMatchFailure(
+ op, "conflicting layout attributes on tensor descriptor and anchor");
+ auto valueLayout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
+ if (valueLayout != layout)
+ return rewriter.notifyMatchFailure(
+ op, "conflicting layout attributes on value and anchor");
+ auto distributedVectorTypeOrFailure =
+ xegpu::getDistributedVectorType(op.getTensorDescType());
+ if (failed(distributedVectorTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute distributed vector type from the layout");
+ xegpu::StoreNdOp::create(rewriter, op.getLoc(), adaptor.getValue(),
+ adaptor.getTensorDesc(), op.getMixedOffsets(),
+ op.getL1HintAttr(), op.getL2HintAttr(),
+ op.getL3HintAttr(), /**layout**/ nullptr);
+ rewriter.eraseOp(op);
+ return success();
+ }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -108,6 +141,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
RewritePatternSet &patterns, TypeConverter &typeConverter) {
- patterns.add<CreateNdDescOpPattern, LoadNdOpPattern>(typeConverter,
- patterns.getContext());
+ patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern>(
+ typeConverter, patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index df21e4a05bfca..a8a654f3b4759 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -25,4 +25,15 @@ gpu.func @load_nd() {
gpu.return
}
+gpu.func @store_nd() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ xegpu.store_nd %2, %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 093e37153cd74..6a6b5f6c6a856 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -250,6 +250,35 @@ struct TestXeGPUSGDistribute
}
};
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
+ VectorType originalType) {
+ if (!layout)
+ return failure();
+ assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
+ "Expecting a valid layout.");
+ SmallVector<int64_t> effectiveLaneLayout =
+ layout.getEffectiveLaneLayoutAsInt();
+ assert(static_cast<size_t>(originalType.getRank()) >=
+ effectiveLaneLayout.size() &&
+ "Rank of the original vector type should be greater or equal to the "
+ "size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ // Only distribute the last `laneLayout.size()` dimensions. The remaining
+ // dimensions are not distributed.
+ unsigned distributionStart =
+ originalType.getRank() - effectiveLaneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart)
+ continue;
+ // Check if the dimension can be distributed evenly.
+ if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+ }
+ return VectorType::get(distributedShape, originalType.getElementType());
+}
+
struct TestXeGPUSgToWiDistributeExperimental
: public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
OperationPass<gpu::GPUModuleOp>> {
@@ -289,38 +318,65 @@ struct TestXeGPUSgToWiDistributeExperimental
}
return type;
});
+ typeConverter.addConversion([](Value v) -> std::optional<Type> {
+ auto type = v.getType();
+ auto layout = xegpu::getDistributeLayoutAttr(v);
+ // If no valid layout, nothing to do.
+ if (!layout || !layout.isForSubgroup())
+ return std::nullopt;
+ // `v` may be a block argument, in which case it has no defining op, so
+ // use a null-tolerant cast. Results of load_nd are distributed according
+ // to the tensor descriptor type of the load.
+ if (auto loadNdOp = dyn_cast_if_present<LoadNdOp>(v.getDefiningOp())) {
+ auto newTyOrFailure =
+ getDistributedVectorType(loadNdOp.getTensorDescType());
+ if (succeeded(newTyOrFailure))
+ return *newTyOrFailure;
+ return std::nullopt;
+ }
+ // For other vector types, distribute based on the lane layout.
+ if (isa<VectorType>(type)) {
+ auto newTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+ if (succeeded(newTyOrFailure))
+ return *newTyOrFailure;
+ }
+ return std::nullopt;
+ });
auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
mlir::ValueRange inputs,
mlir::Location loc) -> mlir::Value {
return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
.getResult(0);
};
- // Define a vector materialization cast. If the input and output have same
- // number of elements, perform a shape cast. Otherwise, use
- // UnrealizedConversionCastOp to handle the conversion.
- auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
- ValueRange inputs,
- Location loc) -> Value {
- if (inputs.size() != 1)
- return {};
- auto input = inputs.front();
- auto inputVecTy = dyn_cast<VectorType>(input.getType());
- auto targetVecTy = dyn_cast<VectorType>(type);
- if (inputVecTy && targetVecTy) {
- if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
- return vector::ShapeCastOp::create(builder, loc, targetVecTy, input)
- .getResult();
- }
- return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
- .getResult(0);
- }
-
- return {};
- };
+ // // Define a vector materialization cast. If the input and output have
+ // same
+ // // number of elements, perform a shape cast. Otherwise, use
+ // // UnrealizedConversionCastOp to handle the conversion.
+ // auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
+ // ValueRange inputs,
+ // Location loc) -> Value {
+ // if (inputs.size() != 1)
+ // return {};
+ // auto input = inputs.front();
+ // auto inputVecTy = dyn_cast<VectorType>(input.getType());
+ // auto targetVecTy = dyn_cast<VectorType>(type);
+ // if (inputVecTy && targetVecTy) {
+ // if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
+ // return vector::ShapeCastOp::create(builder, loc, targetVecTy,
+ // input)
+ // .getResult();
+ // }
+ // return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+ // .getResult(0);
+ // }
+
+ // return {};
+ // };
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
- typeConverter.addSourceMaterialization(vectorMaterializationCast);
- typeConverter.addTargetMaterialization(vectorMaterializationCast);
+ // typeConverter.addSourceMaterialization(vectorMaterializationCast);
+ // typeConverter.addTargetMaterialization(vectorMaterializationCast);
ConversionTarget target(*ctx);
// CreateNdDescOp is legal only if its result type has no layout attribute.
>From 47e66a487fa3daa380ca97440402dcb5bcfa27e6 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 16 Jan 2026 23:41:13 +0000
Subject: [PATCH 06/18] save work
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 17 ++
.../XeGPUSgToWiDistributeExperimental.cpp | 185 ++++++++++++++++--
.../Transforms/XeGPUSubgroupDistribute.cpp | 44 +----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 29 +++
.../XeGPU/sg-to-wi-experimental-unit.mlir | 30 +++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 122 ++++++------
6 files changed, 304 insertions(+), 123 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 6573343a8bc97..d327a431d6ec4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -63,6 +63,23 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
LayoutAttr layout);
+/// Helper function to get the distributed vector type for a source vector
+/// type according to the lane_layout. Each trailing dimension of the vector
+/// shape is divided by the corresponding lane_layout dimension; leading
+/// dimensions (e.g. an array_length dimension) are kept as is. Fails if a
+/// dimension does not divide evenly. NOTE: This is the vector type that
+/// will be returned by the gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout,
+ VectorType originalType);
+
/// Extract a set of small vectors from a value with a given shape using
/// vector.extract_stride_slice
SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 31e8b070b0e8b..ed39a1ee39918 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -12,6 +12,9 @@
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LogicalResult.h"
@@ -28,6 +31,17 @@ using namespace mlir;
namespace {
+static Value resolveTy(ConversionPatternRewriter &rewriter,
+ TypedValue<VectorType> v, VectorType expectedTy) { // Returns `v` as `expectedTy`, shape-casting only when the types differ.
+ if (v.getType() == expectedTy)
+ return v; // Already the expected type; no cast is created.
+ assert(v.getType().getElementType() == expectedTy.getElementType() &&
+ "element types must match");
+ assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
+ "total number of elements must match");
+ return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v); // Same element type and count (asserted above), new shape.
+}
+
struct CreateNdDescOpPattern
: public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -63,17 +77,24 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
if (op.getTensorDescType().getLayout() != layout)
return rewriter.notifyMatchFailure(
op, "conflicting layout attributes on tensor descriptor and anchor");
- auto distributedVectorTypeOrFailure =
+ auto supportedWiResultTyOrFailure =
xegpu::getDistributedVectorType(op.getTensorDescType());
- if (failed(distributedVectorTypeOrFailure))
+ auto expectedWiResultTyOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(layout, op.getType());
+ if (failed(supportedWiResultTyOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute distributed vector type from the layout");
+ op, "unable to compute the workitem vector type for LoadNdOp");
+ if (failed(expectedWiResultTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op,
+ "unable to compute expected workitem vector type from lane layout");
auto newOp = xegpu::LoadNdOp::create(
- rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
+ rewriter, op.getLoc(), supportedWiResultTyOrFailure.value(),
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr(), /**layout**/ nullptr);
- rewriter.replaceOp(op, newOp.getResult());
+ rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
+ expectedWiResultTyOrFailure.value()));
return success();
}
};
@@ -97,20 +118,157 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
if (valueLayout != layout)
return rewriter.notifyMatchFailure(
op, "conflicting layout attributes on value and anchor");
- auto distributedVectorTypeOrFailure =
+ auto supportedWiValueTyOrFailure =
xegpu::getDistributedVectorType(op.getTensorDescType());
- if (failed(distributedVectorTypeOrFailure))
+ if (failed(supportedWiValueTyOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute distributed vector type from the layout");
- xegpu::StoreNdOp::create(rewriter, op.getLoc(), adaptor.getValue(),
- adaptor.getTensorDesc(), op.getMixedOffsets(),
- op.getL1HintAttr(), op.getL2HintAttr(),
- op.getL3HintAttr(), /**layout**/ nullptr);
+ op,
+ "unable to compute wi vector type for StoreNdOp value from tensor "
+ "descriptor");
+
+ xegpu::StoreNdOp::create(
+ rewriter, op.getLoc(),
+ resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
+ supportedWiValueTyOrFailure.value()),
+ adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
+ op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
rewriter.eraseOp(op);
return success();
}
};
+struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
+ using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // Rewrites a DpasOp carrying A, B and CD layouts into one on workitem vector types without layouts.
+ auto layoutA = cast<xegpu::LayoutAttr>(op.getLayoutAAttr());
+ auto layoutB = cast<xegpu::LayoutAttr>(op.getLayoutBAttr());
+ auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
+ if (!layoutA || !layoutB || !layoutCd) // NOTE(review): cast<> above asserts on a null attribute, so this check looks unreachable; dyn_cast_if_present may be intended - confirm.
+ return failure();
+
+ auto wiResultTyOrFailure =
+ xegpu::getDistributedVectorType(op.getType(), layoutCd);
+ auto wiATypeOrFailure =
+ xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
+ auto wiBTypeOrFailure =
+ xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
+ auto expectedWiResultTyOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
+ if (failed(wiResultTyOrFailure) || failed(wiATypeOrFailure) ||
+ failed(wiBTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "failed to calculate supported workitem vector types for DpasOp "
+ "from layouts");
+ if (failed(expectedWiResultTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute expected workitem vector type for DpasOp from "
+ "lane layout");
+ auto newOp = xegpu::DpasOp::create(
+ rewriter, op->getLoc(), wiResultTyOrFailure.value(),
+ resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
+ wiATypeOrFailure.value()),
+ resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
+ wiBTypeOrFailure.value()),
+ resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
+ wiResultTyOrFailure.value()),
+ /** layoutA**/ nullptr,
+ /** layoutB**/ nullptr, /** layoutCd**/ nullptr);
+ // Cast the result to the lane-layout based type to enable correct type materializations.
+ rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
+ expectedWiResultTyOrFailure.value()));
+ return success();
+ }
+};
+
+struct ElementWiseOpPattern : public ConversionPattern {
+ ElementWiseOpPattern(TypeConverter &typeConverter, MLIRContext *ctx)
+ : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} // NOTE(review): `typeConverter` is unused here (not forwarded to the base class) - confirm intended.
+
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override {
+ // Only match ops with elementwise-mappable traits and exactly one result.
+ if (!OpTrait::hasElementwiseMappableTraits(op) || op->getNumResults() != 1)
+ return failure();
+
+ auto resultType = dyn_cast<VectorType>(op->getResult(0).getType());
+ if (!resultType)
+ return rewriter.notifyMatchFailure(
+ op, "operation result is not a vector type");
+
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getTemporaryLayout(llvm::cast<OpResult>(op->getResult(0)));
+ if (!layout || !layout.isForSubgroup())
+ return rewriter.notifyMatchFailure(
+ op, "operation result does not have subgroup distribute layout");
+
+ auto wiShapeOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
+
+ if (failed(wiShapeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute workitem vector type from the layout");
+
+ VectorType newResultType = wiShapeOrFailure.value();
+ OperationState state(op->getLoc(), op->getName());
+ state.addOperands(operands);
+ state.addTypes(newResultType);
+ // Copy every attribute except those whose value is a DistributeLayoutAttr.
+ for (auto attr : op->getAttrs()) {
+ if (!isa<xegpu::DistributeLayoutAttr>(attr.getValue()))
+ state.addAttribute(attr.getName(), attr.getValue());
+ }
+ Operation *newOp = rewriter.create(state);
+
+ rewriter.replaceOp(op, newOp->getResult(0));
+ return success();
+ }
+};
+
+struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
+ using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto resultType = dyn_cast<VectorType>(op.getType());
+ if (!resultType)
+ return failure();
+
+ // Only handle splat vector constants (SplatElementsAttr); others are rejected.
+ auto dense = dyn_cast<SplatElementsAttr>(op.getValue());
+ if (!dense)
+ return rewriter.notifyMatchFailure(
+ op, "only dense splat vector constants are supported");
+
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getTemporaryLayout(llvm::cast<OpResult>(op.getResult()));
+ if (!layout || !layout.isForSubgroup())
+ return rewriter.notifyMatchFailure(
+ op, "operation result does not have subgroup distribute layout");
+
+ auto wiShapeOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
+
+ if (failed(wiShapeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute workitem vector type from the layout");
+
+ VectorType newResultType = wiShapeOrFailure.value();
+ auto sclarValue = dense.getSplatValue<Attribute>();
+ auto newDenseAttr = DenseElementsAttr::get(newResultType, sclarValue);
+
+ auto newOp = arith::ConstantOp::create(rewriter, op.getLoc(), newResultType,
+ newDenseAttr);
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -141,6 +299,7 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
RewritePatternSet &patterns, TypeConverter &typeConverter) {
- patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern>(
+ patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
+ DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
typeConverter, patterns.getContext());
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9113f00ac39f0..8beadeb5da309 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -65,48 +65,6 @@ namespace {
static constexpr unsigned regularPatternBenefit = 1;
static constexpr unsigned highPatternBenefit = 2;
-/// Helper function to get distributed vector type for a source vector type
-/// according to the lane_layout. We simply divide each dimension of tensor
-/// descriptor shape by corresponding lane_layout dimension. If
-/// array_length > 1, that is appended to the front of the ditributed shape.
-/// NOTE: This is the vector type that will be returned by the
-/// gpu.warp_execute_on_lane0 op.
-///
-/// Examples:
-/// | original vector shape | lane_layout | distributed vector shape |
-/// |-----------------------|-------------|--------------------------|
-/// | 32x16 | [1, 16] | 32x1 |
-/// | 32x16 | [2, 8] | 16x2 |
-/// | 2x32x16 | [1, 16] | 2x32x1 |
-static FailureOr<VectorType>
-getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
- VectorType originalType) {
- if (!layout)
- return failure();
- assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
- "Expecting a valid layout.");
- SmallVector<int64_t> effectiveLaneLayout =
- layout.getEffectiveLaneLayoutAsInt();
- assert(static_cast<size_t>(originalType.getRank()) >=
- effectiveLaneLayout.size() &&
- "Rank of the original vector type should be greater or equal to the "
- "size of the lane layout to distribute the vector type.");
- SmallVector<int64_t> distributedShape(originalType.getShape());
- // Only distribute the last `laneLayout.size()` dimensions. The remaining
- // dimensions are not distributed.
- unsigned distributionStart =
- originalType.getRank() - effectiveLaneLayout.size();
- for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
- if (i < distributionStart)
- continue;
- // Check if the dimension can be distributed evenly.
- if (dim % effectiveLaneLayout[i - distributionStart] != 0)
- return failure();
- distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
- }
- return VectorType::get(distributedShape, originalType.getElementType());
-}
-
/// Helper function to resolve types if the distributed type out of
/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
/// Example 1:
@@ -409,7 +367,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
storeOp, "the source tensor descriptor lacks layout attribute");
FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+ xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
if (failed(distributedTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 51783b41c4c96..addec519c405e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -101,6 +101,35 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
return xegpu::getDistributedVectorType(helperTdescTy);
}
+FailureOr<VectorType>
+xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
+ VectorType originalType) {
+ if (!layout)
+ return failure();
+ assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
+ "Expecting a valid layout.");
+ SmallVector<int64_t> effectiveLaneLayout =
+ layout.getEffectiveLaneLayoutAsInt();
+ assert(static_cast<size_t>(originalType.getRank()) >=
+ effectiveLaneLayout.size() &&
+ "Rank of the original vector type should be greater or equal to the "
+ "size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ // Only distribute the last `effectiveLaneLayout.size()` dimensions. The
+ // remaining (leading) dimensions keep their original size.
+ unsigned distributionStart =
+ originalType.getRank() - effectiveLaneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart)
+ continue;
+ // Fail if the dimension is not evenly divisible by its lane layout entry.
+ if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+ }
+ return VectorType::get(distributedShape, originalType.getElementType());
+}
+
std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
const StringRef prefix("layout_operand_");
unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a8a654f3b4759..c89586f450164 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -36,4 +36,34 @@ gpu.func @store_nd() {
gpu.return
}
+gpu.func @dpas_op() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %5 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<0.0> : vector<8x16xf32>
+ %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ %4 = xegpu.dpas %2, %3, %5
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ gpu.return
+}
+
+gpu.func @elementwise_op() {
+ %c0 = arith.constant 0 : index
+ %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<1.0> : vector<16x16xf32>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %2 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+ %3 = arith.addf %0, %2
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16x16xf32>
+ gpu.return
+}
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 6a6b5f6c6a856..b3cd4ebdc71d0 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -6,8 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -18,6 +20,7 @@
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/raw_ostream.h"
#include <optional>
using namespace mlir;
@@ -250,35 +253,6 @@ struct TestXeGPUSGDistribute
}
};
-static FailureOr<VectorType>
-getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
- VectorType originalType) {
- if (!layout)
- return failure();
- assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
- "Expecting a valid layout.");
- SmallVector<int64_t> effectiveLaneLayout =
- layout.getEffectiveLaneLayoutAsInt();
- assert(static_cast<size_t>(originalType.getRank()) >=
- effectiveLaneLayout.size() &&
- "Rank of the original vector type should be greater or equal to the "
- "size of the lane layout to distribute the vector type.");
- SmallVector<int64_t> distributedShape(originalType.getShape());
- // Only distribute the last `laneLayout.size()` dimensions. The remaining
- // dimensions are not distributed.
- unsigned distributionStart =
- originalType.getRank() - effectiveLaneLayout.size();
- for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
- if (i < distributionStart)
- continue;
- // Check if the dimension can be distributed evenly.
- if (dim % effectiveLaneLayout[i - distributionStart] != 0)
- return failure();
- distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
- }
- return VectorType::get(distributedShape, originalType.getElementType());
-}
-
struct TestXeGPUSgToWiDistributeExperimental
: public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
OperationPass<gpu::GPUModuleOp>> {
@@ -311,7 +285,12 @@ struct TestXeGPUSgToWiDistributeExperimental
MLIRContext *ctx = &getContext();
TypeConverter typeConverter;
- typeConverter.addConversion([](Type type) { return type; });
+ typeConverter.addConversion([](Type type) -> std::optional<Type> {
+ // non tensor_desc and vector types are legal as is.
+ if (!isa<TensorDescType, VectorType>(type))
+ return type;
+ return std::nullopt;
+ });
typeConverter.addConversion([](TensorDescType type) -> Type {
if (type.getLayoutAttr()) {
return type.dropLayouts();
@@ -324,16 +303,16 @@ struct TestXeGPUSgToWiDistributeExperimental
// If no valid layout, nothing to do.
if (!layout || !layout.isForSubgroup())
return std::nullopt;
- Operation *op = v.getDefiningOp();
- if (isa<LoadNdOp>(op)) {
- auto loadNdOp = cast<LoadNdOp>(op);
- layout = loadNdOp.getAnchorLayout();
- auto newTyOrFailure =
- getDistributedVectorType(loadNdOp.getTensorDescType());
- if (succeeded(newTyOrFailure))
- return *newTyOrFailure;
- return std::nullopt;
- }
+ // Operation *op = v.getDefiningOp();
+ // if (isa<LoadNdOp>(op)) {
+ // auto loadNdOp = cast<LoadNdOp>(op);
+ // layout = loadNdOp.getAnchorLayout();
+ // auto newTyOrFailure =
+ // getDistributedVectorType(loadNdOp.getTensorDescType());
+ // if (succeeded(newTyOrFailure))
+ // return *newTyOrFailure;
+ // return std::nullopt;
+ // }
// For other vector types, distribute based on the lane layout.
if (isa<VectorType>(type)) {
auto newTyOrFailure =
@@ -349,34 +328,8 @@ struct TestXeGPUSgToWiDistributeExperimental
return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
.getResult(0);
};
- // // Define a vector materialization cast. If the input and output have
- // same
- // // number of elements, perform a shape cast. Otherwise, use
- // // UnrealizedConversionCastOp to handle the conversion.
- // auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
- // ValueRange inputs,
- // Location loc) -> Value {
- // if (inputs.size() != 1)
- // return {};
- // auto input = inputs.front();
- // auto inputVecTy = dyn_cast<VectorType>(input.getType());
- // auto targetVecTy = dyn_cast<VectorType>(type);
- // if (inputVecTy && targetVecTy) {
- // if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
- // return vector::ShapeCastOp::create(builder, loc, targetVecTy,
- // input)
- // .getResult();
- // }
- // return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
- // .getResult(0);
- // }
-
- // return {};
- // };
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
- // typeConverter.addSourceMaterialization(vectorMaterializationCast);
- // typeConverter.addTargetMaterialization(vectorMaterializationCast);
ConversionTarget target(*ctx);
// CreateNdDescOp is legal only if its result type has no layout attribute.
@@ -391,11 +344,46 @@ struct TestXeGPUSgToWiDistributeExperimental
return true;
return !anchorOp.getAnchorLayout();
});
+ target.addDynamicallyLegalOp<arith::ConstantOp>(
+ [=](arith::ConstantOp op) -> bool {
+ // If the result type is not a vector, it's legal.
+ if (!isa<VectorType>(op.getResult().getType()))
+ return true;
+ // For vector result types, check if it has a layout attribute.
+ return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
+ });
+ // In math and arith dialects, only handle elementwise ops with a single
+ // result and with a result layout attribute.
+ target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+ [=](Operation *op) -> std::optional<bool> {
+ // Only handle elementwise mappable ops
+ if (!OpTrait::hasElementwiseMappableTraits(op))
+ return true;
+ // Only handle ops with single vector result
+ if (op->getNumResults() != 1)
+ return true;
+
+ VectorType resultType =
+ dyn_cast<VectorType>(op->getResult(0).getType());
+ if (!resultType)
+ return true;
+
+ // Check if all operands are vectors of the same shape
+ for (Value operand : op->getOperands()) {
+ VectorType operandType = dyn_cast<VectorType>(operand.getType());
+ if (!operandType ||
+ operandType.getShape() != resultType.getShape()) {
+ return true;
+ }
+ }
+ return !xegpu::getTemporaryLayout(
+ dyn_cast<OpResult>(op->getResult(0)));
+ });
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
typeConverter);
- target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
(void)applyPartialConversion(getOperation(), target, std::move(patterns));
}
>From 5708cc4c0c00e411beb4478c606fea59b4fb4a7d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 20 Jan 2026 19:37:42 +0000
Subject: [PATCH 07/18] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 9 +-
.../XeGPUSgToWiDistributeExperimental.cpp | 144 +++++++++++++++---
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 101 +-----------
3 files changed, 131 insertions(+), 123 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 898af0ec14738..1930ef8e454d4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -71,10 +71,11 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU workgroup to subgroup distribution into
/// `patterns`.
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
-/// Appends patterns for XeGPU subgroup to work-item distribution into
-/// `patterns`.
-void populateXeGPUSgToWiDistributeExperimentalPatterns(
- RewritePatternSet &patterns, TypeConverter &typeConverter);
+/// Defines type conversions and legality for XeGPU subgroup to workitem
+/// distribution and appends the required conversion patterns into `patterns`.
+void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ TypeConverter &typeConverter, RewritePatternSet &patterns,
+ ConversionTarget &target);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
/// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index ed39a1ee39918..0fc022241095b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -14,6 +15,7 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/GraphWriter.h"
@@ -42,6 +44,33 @@ static Value resolveTy(ConversionPatternRewriter &rewriter,
return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
}
+static LogicalResult verifyLayouts(Operation *root) { // Fails if any op visited by the walk lacks a required layout.
+ auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
+ auto layout = anchorOp.getAnchorLayout();
+ if (!layout) {
+ nestedOp->emitError("expected anchor layout attribute on operation");
+ return WalkResult::interrupt();
+ }
+ return WalkResult::advance();
+ }
+ // Non-anchor ops: every vector-typed result must have a distribute layout;
+ // otherwise an error is emitted and the walk is interrupted.
+ for (OpResult result : nestedOp->getResults()) {
+ if (isa<VectorType>(result.getType())) {
+ auto layout = xegpu::getDistributeLayoutAttr(result);
+ if (!layout) {
+ nestedOp->emitError(
+ "expected result layout attribute on vector result");
+ return WalkResult::interrupt();
+ }
+ }
+ }
+ return WalkResult::advance();
+ });
+ return walkResult.wasInterrupted() ? failure() : success();
+}
+
struct CreateNdDescOpPattern
: public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -278,27 +307,102 @@ struct XeGPUSgToWiDistributeExperimentalPass
} // namespace
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
- // // Recover layouts.
- // Operation *op = getOperation();
- // if (!xegpu::recoverTemporaryLayouts(op)) {
- // signalPassFailure();
- // return;
- // }
-
- // // Define conversion target
- // ConversionTarget target(getContext());
- // target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
- // vector::VectorDialect>();
- // target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
- // [](Operation *op) { return true; });
-
- // // Define type converter
- // TypeConverter typeConverter;
- // typeConverter.addConversion([](Type type) { return type; });
+ // Verify if all XeGPU and vector operations have layouts.
+ Operation *root = getOperation();
+ if (failed(verifyLayouts(root))) {
+ signalPassFailure();
+ return;
+ }
}
-
-void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
- RewritePatternSet &patterns, TypeConverter &typeConverter) {
+void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ TypeConverter &typeConverter, RewritePatternSet &patterns,
+ ConversionTarget &target) {
+
+ // Populate type conversions.
+ // - Any type other than TensorDescType and VectorType are legal as is.
+ typeConverter.addConversion([](Type type) -> std::optional<Type> {
+ if (!isa<TensorDescType, VectorType>(type))
+ return type;
+ return std::nullopt;
+ });
+ // For TensorDescType, drop the layout attribute if any.
+ typeConverter.addConversion([](TensorDescType type) -> Type {
+ if (type.getLayoutAttr()) {
+ return type.dropLayouts();
+ }
+ return type;
+ });
+ // - For VectorType, check if there is a distribute layout attribute on the
+ // value. If so, convert to the distributed vector type based on the layout.
+ typeConverter.addConversion([](Value v) -> std::optional<Type> {
+ auto type = v.getType();
+ auto layout = xegpu::getDistributeLayoutAttr(v);
+ // If no valid layout, nothing to do.
+ if (!layout || !layout.isForSubgroup())
+ return std::nullopt;
+ // Vector type is distributed based on lane layout.
+ if (isa<VectorType>(type)) {
+ auto newTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+ if (succeeded(newTyOrFailure))
+ return *newTyOrFailure;
+ }
+ return std::nullopt;
+ });
+ // - Materialization casts are only used for testing purposes.
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+ .getResult(0);
+ };
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+ // Define legality.
+ // - CreateNdDescOp is legal only if its result type has no layout attribute.
+ target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+ [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
+ // - Any anchor XeGPU op is legal only if it has no anchor layout.
+ target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
+ auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
+ if (!anchorOp)
+ return true;
+ return !anchorOp.getAnchorLayout();
+ });
+ target.addDynamicallyLegalOp<arith::ConstantOp>(
+ [=](arith::ConstantOp op) -> bool {
+ // If the result type is not a vector, it's legal.
+ if (!isa<VectorType>(op.getResult().getType()))
+ return true;
+ // For vector result types, check if it has a layout attribute.
+ return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
+ });
+ // - In math and arith dialects, only handle elementwise ops with a single
+ // result and with a result layout attribute.
+ target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+ [=](Operation *op) -> std::optional<bool> {
+ // Only handle elementwise mappable ops
+ if (!OpTrait::hasElementwiseMappableTraits(op))
+ return true;
+ // Only handle ops with single vector result
+ if (op->getNumResults() != 1)
+ return true;
+
+ VectorType resultType =
+ dyn_cast<VectorType>(op->getResult(0).getType());
+ if (!resultType)
+ return true;
+
+ // Check if all operands are vectors of the same shape
+ for (Value operand : op->getOperands()) {
+ VectorType operandType = dyn_cast<VectorType>(operand.getType());
+ if (!operandType || operandType.getShape() != resultType.getShape()) {
+ return true;
+ }
+ }
+ return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
+ });
+ target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
typeConverter, patterns.getContext());
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index b3cd4ebdc71d0..c28ebf1f8bede 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -283,108 +283,11 @@ struct TestXeGPUSgToWiDistributeExperimental
void runOnOperation() override {
MLIRContext *ctx = &getContext();
-
TypeConverter typeConverter;
- typeConverter.addConversion([](Type type) -> std::optional<Type> {
- // non tensor_desc and vector types are legal as is.
- if (!isa<TensorDescType, VectorType>(type))
- return type;
- return std::nullopt;
- });
- typeConverter.addConversion([](TensorDescType type) -> Type {
- if (type.getLayoutAttr()) {
- return type.dropLayouts();
- }
- return type;
- });
- typeConverter.addConversion([](Value v) -> std::optional<Type> {
- auto type = v.getType();
- auto layout = xegpu::getDistributeLayoutAttr(v);
- // If no valid layout, nothing to do.
- if (!layout || !layout.isForSubgroup())
- return std::nullopt;
- // Operation *op = v.getDefiningOp();
- // if (isa<LoadNdOp>(op)) {
- // auto loadNdOp = cast<LoadNdOp>(op);
- // layout = loadNdOp.getAnchorLayout();
- // auto newTyOrFailure =
- // getDistributedVectorType(loadNdOp.getTensorDescType());
- // if (succeeded(newTyOrFailure))
- // return *newTyOrFailure;
- // return std::nullopt;
- // }
- // For other vector types, distribute based on the lane layout.
- if (isa<VectorType>(type)) {
- auto newTyOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
- if (succeeded(newTyOrFailure))
- return *newTyOrFailure;
- }
- return std::nullopt;
- });
- auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
- mlir::ValueRange inputs,
- mlir::Location loc) -> mlir::Value {
- return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
- .getResult(0);
- };
- typeConverter.addSourceMaterialization(materializeCast);
- typeConverter.addTargetMaterialization(materializeCast);
-
ConversionTarget target(*ctx);
- // CreateNdDescOp is legal only if its result type has no layout attribute.
- target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
- [&](xegpu::CreateNdDescOp op) {
- return !op.getType().getLayoutAttr();
- });
- // Any anchor XeGPU op is legal only if it has no anchor layout.
- target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
- auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
- if (!anchorOp)
- return true;
- return !anchorOp.getAnchorLayout();
- });
- target.addDynamicallyLegalOp<arith::ConstantOp>(
- [=](arith::ConstantOp op) -> bool {
- // If the result type is not a vector, it's legal.
- if (!isa<VectorType>(op.getResult().getType()))
- return true;
- // For vector result types, check if it has a layout attribute.
- return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
- });
- // In math and arith dialects, only handle elementwise ops with a single
- // result and with a result layout attribute.
- target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
- [=](Operation *op) -> std::optional<bool> {
- // Only handle elementwise mappable ops
- if (!OpTrait::hasElementwiseMappableTraits(op))
- return true;
- // Only handle ops with single vector result
- if (op->getNumResults() != 1)
- return true;
-
- VectorType resultType =
- dyn_cast<VectorType>(op->getResult(0).getType());
- if (!resultType)
- return true;
-
- // Check if all operands are vectors of the same shape
- for (Value operand : op->getOperands()) {
- VectorType operandType = dyn_cast<VectorType>(operand.getType());
- if (!operandType ||
- operandType.getShape() != resultType.getShape()) {
- return true;
- }
- }
- return !xegpu::getTemporaryLayout(
- dyn_cast<OpResult>(op->getResult(0)));
- });
- target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-
RewritePatternSet patterns(ctx);
- xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
- typeConverter);
-
+ xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ typeConverter, patterns, target);
(void)applyPartialConversion(getOperation(), target, std::move(patterns));
}
};
>From 4579c154bbdd8cfc55c5f5e6858b39b53adc8fee Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 06:24:52 +0000
Subject: [PATCH 08/18] save working version
---
.../Dialect/XeGPU/Transforms/Transforms.h | 5 +
.../XeGPUSgToWiDistributeExperimental.cpp | 263 +++++++++++++++---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 3 +-
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 3 +
4 files changed, 240 insertions(+), 34 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1930ef8e454d4..35a5154a2ce59 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -71,8 +71,13 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU workgroup to subgroup distribution into
/// `patterns`.
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
+/// Define only the type conversions needed for XeGPU subgroup to workitem
+/// distribution.
+void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
/// Defines type conversions and legality for XeGPU subgroup to workitem
/// distribution and appends the required conversion patterns into `patterns`.
+/// Appends patterns for XeGPU subgroup to workitem distribution into
+/// `patterns`.
void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 0fc022241095b..b79cf652083de 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -8,19 +8,27 @@
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
+#include <optional>
namespace mlir {
namespace xegpu {
@@ -31,17 +39,30 @@ namespace xegpu {
using namespace mlir;
+#define DEBUG_TYPE "xegpu-sg-to-wi-distribute-experimental"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
namespace {
static Value resolveTy(ConversionPatternRewriter &rewriter,
TypedValue<VectorType> v, VectorType expectedTy) {
+ // llvm::errs() << "value:" << v << " expectedTy: " << expectedTy << "\n";
if (v.getType() == expectedTy)
return v;
- assert(v.getType().getElementType() == expectedTy.getElementType() &&
- "element types must match");
- assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
- "total number of elements must match");
- return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
+ // assert(v.getType().getElementType() == expectedTy.getElementType() &&
+ // "element types must match");
+ // assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
+ // "total number of elements must match");
+ // If both types are vector type and number of elements match, insert a shape
+ // cast.
+ if (isa<VectorType>(v.getType()) &&
+ v.getType().getNumElements() == expectedTy.getNumElements())
+ return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
+
+ // else create an unrealized cast.
+ auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
+ expectedTy, ValueRange{v});
+ return newOp.getResult(0);
}
static LogicalResult verifyLayouts(Operation *root) {
@@ -172,13 +193,14 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
LogicalResult
matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ // llvm::errs() << "DpasOpPattern matchAndRewrite called\n";
// Check if the op has A, B and CD layouts attached.
auto layoutA = cast<xegpu::LayoutAttr>(op.getLayoutAAttr());
auto layoutB = cast<xegpu::LayoutAttr>(op.getLayoutBAttr());
auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
if (!layoutA || !layoutB || !layoutCd)
return failure();
-
+ // llvm::errs() << "tryning to calculate wi types for dpas op\n";
auto wiResultTyOrFailure =
xegpu::getDistributedVectorType(op.getType(), layoutCd);
auto wiATypeOrFailure =
@@ -196,6 +218,8 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
return rewriter.notifyMatchFailure(
op, "unable to compute expected workitem vector type for DpasOp from "
"lane layout");
+ // llvm::errs() << "adaptor acc type: " << adaptor.getAcc().getType() <<
+ // "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
auto newOp = xegpu::DpasOp::create(
rewriter, op->getLoc(), wiResultTyOrFailure.value(),
resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
@@ -307,17 +331,194 @@ struct XeGPUSgToWiDistributeExperimentalPass
} // namespace
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
- // Verify if all XeGPU and vector operations have layouts.
+
+ // llvm::errs() << "Running XeGPUSgToWiDistributeExperimentalPass\n";
+ // Verify if all XeGPU anchor ops and vector ops have result layouts.
Operation *root = getOperation();
- if (failed(verifyLayouts(root))) {
- signalPassFailure();
- return;
+ // if (failed(verifyLayouts(root))) {
+ // LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
+ // "verification failed\n");
+ // signalPassFailure();
+ // return;
+ // }
+ // Collect existing UnrealizedConversionCastOps.
+ llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
+ // root->walk(
+ // [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp);
+ // });
+ // Perform a structural type conversion. This will insert
+ // UnrealizedConversionCastOps for type materializations.
+ // auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ // mlir::ValueRange inputs,
+ // mlir::Location loc) -> mlir::Value {
+ // // If single input and both input and output types are vector types,
+ // // and they have same number of elements, insert a shape cast.
+ // // if (inputs.size() == 1) {
+ // // auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
+ // // auto outputTy = dyn_cast<VectorType>(type);
+ // // if (inputTy && outputTy &&
+ // // inputTy.getNumElements() == outputTy.getNumElements()) {
+ // // return vector::ShapeCastOp::create(builder, loc, outputTy,
+ // // inputs[0])
+ // // .getResult();
+ // // }
+ // // }
+ // UnrealizedConversionCastOp castOp =
+ // UnrealizedConversionCastOp::create(builder, loc, type, inputs);
+
+ // // // If inputs is a single vector type and type is also a vector, then
+ // // layout
+ // // // must be propagated.
+ // // if (inputs.size() == 1 && isa<VectorType>(inputs[0].getType()) &&
+ // // isa<VectorType>(type)) {
+ // // auto layout = xegpu::getDistributeLayoutAttr(inputs[0]);
+ // // if (layout)
+ // // xegpu::setDistributeLayoutAttr(castOp->getOpResult(0), layout);
+ // // }
+
+ // return castOp.getResult(0);
+ // };
+ // {
+ // ConversionTarget target(getContext());
+ // TypeConverter typeConverter;
+ // RewritePatternSet patterns(&getContext());
+ // typeConverter.addSourceMaterialization(materializeCast);
+ // typeConverter.addTargetMaterialization(materializeCast);
+ // xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+ // scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+ // patterns, target);
+ // target.addLegalOp<UnrealizedConversionCastOp>();
+ // (void)applyPartialConversion(root, target, std::move(patterns));
+ // }
+ // // Apply the XeGPU subgroup to workitem distribution patterns.
+ // {
+ // ConversionTarget target(getContext());
+ // TypeConverter typeConverter;
+ // typeConverter.addTargetMaterialization(materializeCast);
+ // typeConverter.addSourceMaterialization(materializeCast);
+ // RewritePatternSet patterns(&getContext());
+ // xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ // typeConverter, patterns, target);
+ // target.addLegalOp<UnrealizedConversionCastOp>();
+ // (void)applyPartialConversion(root, target, std::move(patterns));
+ // }
+ // UnrealizedConversionCastOp is legal if it existed before.
+ // target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+ // [&](UnrealizedConversionCastOp op) {
+ // return existingCasts.contains(op);
+ // });
+ // Define a pattern for handling UnrealizedConversionCastOps that were
+ // newly created during the structural type conversion.
+ class ResolveUnrealizedCastPattern
+ : public OpConversionPattern<UnrealizedConversionCastOp> {
+ public:
+ // Pass existsingCasts in the constructor to identify existing casts.
+ ResolveUnrealizedCastPattern(
+ TypeConverter &typeConverter,
+ llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts,
+ MLIRContext &ctx)
+ : OpConversionPattern<UnrealizedConversionCastOp>(typeConverter, &ctx),
+ existingCasts(existingCasts) {}
+ // using
+ // OpConversionPattern<UnrealizedConversionCastOp>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(UnrealizedConversionCastOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // If this op existed before, nothing to do.
+ if (existingCasts.contains(op))
+ return failure();
+ // number of inputs and outputs must be 1.
+ if (op.getNumOperands() != 1 || op.getNumResults() != 1)
+ return failure();
+ // Both input and output types must be vector types.
+ auto singleInput = op.getInputs()[0];
+ auto inputTy = dyn_cast<VectorType>(singleInput.getType());
+ auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
+ llvm::errs() << "input ty : " << inputTy << " output ty: " << outputTy
+ << "\n";
+ if (!inputTy || !outputTy)
+ return failure();
+
+ // Check if the defining op of the input is also an
+ // UnrealizedConversionCastOp.
+ auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
+ if (!definingOp)
+ return rewriter.notifyMatchFailure(
+ op, "input defining op is not an UnrealizedConversionCastOp");
+ auto inputOfDefiningOp = definingOp.getInputs()[0];
+ // If the input of the defining op and output type are both vector types
+ // have same number of elements, insert a shape cast.
+ auto inputOfDefiningOpTy =
+ dyn_cast<VectorType>(inputOfDefiningOp.getType());
+ if (inputOfDefiningOpTy && outputTy &&
+ inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
+ auto shapeCast = vector::ShapeCastOp::create(
+ rewriter, op.getLoc(), outputTy, inputOfDefiningOp);
+ rewriter.replaceOp(op, shapeCast.getResult());
+ return success();
+ }
+
+ return rewriter.notifyMatchFailure(
+ op, "unable to resolve unrealized conversion cast");
+ }
+
+ private:
+ llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts;
+ };
+ // Finally, remove unnecessary UnrealizedConversionCastOps.
+ OpBuilder builder(root);
+ root->walk([&](UnrealizedConversionCastOp op) {
+ // If this op existed before, nothing to do.
+ if (existingCasts.contains(op))
+ return;
+ // number of inputs and outputs must be 1.
+ if (op.getNumOperands() != 1 || op.getNumResults() != 1)
+ return;
+ // Both input and output types must be vector types.
+ auto singleInput = op.getInputs()[0];
+ auto inputTy = dyn_cast<VectorType>(singleInput.getType());
+ auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
+ if (!inputTy || !outputTy)
+ return;
+
+ // Check if the defining op of the input is also an
+ // UnrealizedConversionCastOp and it has a single user (which is this op).
+ auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
+ if (!definingOp || !definingOp->hasOneUse())
+ return;
+ auto inputOfDefiningOp = definingOp.getInputs()[0];
+ // If the input of the defining op and the output type are both vector
+ // types with the same number of elements, insert a shape cast.
+ auto inputOfDefiningOpTy =
+ dyn_cast<VectorType>(inputOfDefiningOp.getType());
+ if (inputOfDefiningOpTy &&
+ inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
+ builder.setInsertionPoint(op);
+ auto shapeCast = vector::ShapeCastOp::create(builder, op.getLoc(),
+ outputTy, inputOfDefiningOp);
+ op.replaceAllUsesWith(ValueRange{shapeCast.getResult()});
+ return;
+ }
+ });
+ // At this point, we will have some dead UnrealizedConversionCastOps. Just
+ // erase them.
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ root->walk([&](UnrealizedConversionCastOp op) {
+ // Skip existing casts.
+ if (existingCasts.contains(op))
+ return;
+ if (op.use_empty()) {
+ op.erase();
+ changed = true;
+ }
+ });
}
}
-void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
- TypeConverter &typeConverter, RewritePatternSet &patterns,
- ConversionTarget &target) {
+void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
+ TypeConverter &typeConverter) {
// Populate type conversions.
// - Any type other than TensorDescType and VectorType are legal as is.
typeConverter.addConversion([](Type type) -> std::optional<Type> {
@@ -325,7 +526,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
return type;
return std::nullopt;
});
- // For TensorDescType, drop the layout attribute if any.
+ // - For TensorDescType, drop the layout attribute if any.
typeConverter.addConversion([](TensorDescType type) -> Type {
if (type.getLayoutAttr()) {
return type.dropLayouts();
@@ -336,29 +537,25 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
// value. If so, convert to the distributed vector type based on the layout.
typeConverter.addConversion([](Value v) -> std::optional<Type> {
auto type = v.getType();
+ // If value is not vector type, nothing to do.
+ if (!isa<VectorType>(type))
+ return std::nullopt;
auto layout = xegpu::getDistributeLayoutAttr(v);
- // If no valid layout, nothing to do.
if (!layout || !layout.isForSubgroup())
- return std::nullopt;
+ return type;
// Vector type is distributed based on lane layout.
- if (isa<VectorType>(type)) {
- auto newTyOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
- if (succeeded(newTyOrFailure))
- return *newTyOrFailure;
- }
- return std::nullopt;
+ auto newTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+ if (failed(newTyOrFailure))
+ return type;
+ return *newTyOrFailure;
});
- // - Materialization casts are only used for testing purposes.
- auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
- mlir::ValueRange inputs,
- mlir::Location loc) -> mlir::Value {
- return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
- .getResult(0);
- };
- typeConverter.addSourceMaterialization(materializeCast);
- typeConverter.addTargetMaterialization(materializeCast);
- // Define legality.
+}
+
+void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ TypeConverter &typeConverter, RewritePatternSet &patterns,
+ ConversionTarget &target) {
+ populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
// - CreateNdDescOp is legal only if its result type has no layout attribute.
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
[&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index addec519c405e..1a1f331efe608 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -21,6 +21,7 @@
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
@@ -168,7 +169,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto *parentOp = arg.getOwner()->getParentOp();
- if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
+ if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
if (tiedInit)
return getDistributeLayoutAttr(tiedInit->get());
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index c28ebf1f8bede..03a11908499e1 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -10,6 +10,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -288,6 +289,8 @@ struct TestXeGPUSgToWiDistributeExperimental
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
typeConverter, patterns, target);
+ scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+ patterns, target);
(void)applyPartialConversion(getOperation(), target, std::move(patterns));
}
};
>From 63eed8461498870fc46014c17c817a0af178c299 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 17:42:23 +0000
Subject: [PATCH 09/18] save working version
---
.../XeGPUSgToWiDistributeExperimental.cpp | 211 +++++-------------
1 file changed, 60 insertions(+), 151 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b79cf652083de..8005637da4bf8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -44,17 +44,10 @@ using namespace mlir;
namespace {
-static Value resolveTy(ConversionPatternRewriter &rewriter,
- TypedValue<VectorType> v, VectorType expectedTy) {
- // llvm::errs() << "value:" << v << " expectedTy: " << expectedTy << "\n";
+static Value castValueTo(ConversionPatternRewriter &rewriter,
+ TypedValue<VectorType> v, VectorType expectedTy) {
if (v.getType() == expectedTy)
return v;
- // assert(v.getType().getElementType() == expectedTy.getElementType() &&
- // "element types must match");
- // assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
- // "total number of elements must match");
- // If both types are vector type and number of elements match, insert a shape
- // cast.
if (isa<VectorType>(v.getType()) &&
v.getType().getNumElements() == expectedTy.getNumElements())
return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
@@ -143,8 +136,8 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr(), /**layout**/ nullptr);
- rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
- expectedWiResultTyOrFailure.value()));
+ rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+ expectedWiResultTyOrFailure.value()));
return success();
}
};
@@ -178,8 +171,8 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
xegpu::StoreNdOp::create(
rewriter, op.getLoc(),
- resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
- supportedWiValueTyOrFailure.value()),
+ castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
+ supportedWiValueTyOrFailure.value()),
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
rewriter.eraseOp(op);
@@ -222,17 +215,17 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
// "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
auto newOp = xegpu::DpasOp::create(
rewriter, op->getLoc(), wiResultTyOrFailure.value(),
- resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
- wiATypeOrFailure.value()),
- resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
- wiBTypeOrFailure.value()),
- resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
- wiResultTyOrFailure.value()),
+ castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
+ wiATypeOrFailure.value()),
+ castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
+ wiBTypeOrFailure.value()),
+ castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
+ wiResultTyOrFailure.value()),
/** layoutA**/ nullptr,
/** layoutB**/ nullptr, /** layoutCd**/ nullptr);
// Explicitly set the new types to enable correct type materializations.
- rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
- expectedWiResultTyOrFailure.value()));
+ rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+ expectedWiResultTyOrFailure.value()));
return success();
}
};
@@ -332,140 +325,56 @@ struct XeGPUSgToWiDistributeExperimentalPass
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
- // llvm::errs() << "Running XeGPUSgToWiDistributeExperimentalPass\n";
// Verify if all XeGPU anchor ops and vector ops have result layouts.
Operation *root = getOperation();
- // if (failed(verifyLayouts(root))) {
- // LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
- // "verification failed\n");
- // signalPassFailure();
- // return;
- // }
- // Collect existing UnrealizedConversionCastOps.
+ if (failed(verifyLayouts(root))) {
+ LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
+ "verification failed\n");
+ signalPassFailure();
+ return;
+ }
+ // Collect existing UnrealizedConversionCastOps. These must be preserved.
llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
- // root->walk(
- // [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp);
- // });
- // Perform a structural type conversion. This will insert
- // UnrealizedConversionCastOps for type materializations.
- // auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
- // mlir::ValueRange inputs,
- // mlir::Location loc) -> mlir::Value {
- // // If single input and both input and output types are vector types,
- // // and they have same number of elements, insert a shape cast.
- // // if (inputs.size() == 1) {
- // // auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
- // // auto outputTy = dyn_cast<VectorType>(type);
- // // if (inputTy && outputTy &&
- // // inputTy.getNumElements() == outputTy.getNumElements()) {
- // // return vector::ShapeCastOp::create(builder, loc, outputTy,
- // // inputs[0])
- // // .getResult();
- // // }
- // // }
- // UnrealizedConversionCastOp castOp =
- // UnrealizedConversionCastOp::create(builder, loc, type, inputs);
-
- // // // If inputs is a single vector type and type is also a vector, then
- // // layout
- // // // must be propagated.
- // // if (inputs.size() == 1 && isa<VectorType>(inputs[0].getType()) &&
- // // isa<VectorType>(type)) {
- // // auto layout = xegpu::getDistributeLayoutAttr(inputs[0]);
- // // if (layout)
- // // xegpu::setDistributeLayoutAttr(castOp->getOpResult(0), layout);
- // // }
-
- // return castOp.getResult(0);
- // };
- // {
- // ConversionTarget target(getContext());
- // TypeConverter typeConverter;
- // RewritePatternSet patterns(&getContext());
- // typeConverter.addSourceMaterialization(materializeCast);
- // typeConverter.addTargetMaterialization(materializeCast);
- // xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
- // scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
- // patterns, target);
- // target.addLegalOp<UnrealizedConversionCastOp>();
- // (void)applyPartialConversion(root, target, std::move(patterns));
- // }
- // // Apply the XeGPU subgroup to workitem distribution patterns.
- // {
- // ConversionTarget target(getContext());
- // TypeConverter typeConverter;
- // typeConverter.addTargetMaterialization(materializeCast);
- // typeConverter.addSourceMaterialization(materializeCast);
- // RewritePatternSet patterns(&getContext());
- // xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
- // typeConverter, patterns, target);
- // target.addLegalOp<UnrealizedConversionCastOp>();
- // (void)applyPartialConversion(root, target, std::move(patterns));
- // }
- // UnrealizedConversionCastOp is legal if it existed before.
- // target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
- // [&](UnrealizedConversionCastOp op) {
- // return existingCasts.contains(op);
- // });
- // Define a pattern for handling UnrealizedConversionCastOps that were
- // newly created during the structural type conversion.
- class ResolveUnrealizedCastPattern
- : public OpConversionPattern<UnrealizedConversionCastOp> {
- public:
- // Pass existsingCasts in the constructor to identify existing casts.
- ResolveUnrealizedCastPattern(
- TypeConverter &typeConverter,
- llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts,
- MLIRContext &ctx)
- : OpConversionPattern<UnrealizedConversionCastOp>(typeConverter, &ctx),
- existingCasts(existingCasts) {}
- // using
- // OpConversionPattern<UnrealizedConversionCastOp>::OpConversionPattern;
- LogicalResult
- matchAndRewrite(UnrealizedConversionCastOp op, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- // If this op existed before, nothing to do.
- if (existingCasts.contains(op))
- return failure();
- // number of inputs and outputs must be 1.
- if (op.getNumOperands() != 1 || op.getNumResults() != 1)
- return failure();
- // Both input and output types must be vector types.
- auto singleInput = op.getInputs()[0];
- auto inputTy = dyn_cast<VectorType>(singleInput.getType());
- auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
- llvm::errs() << "input ty : " << inputTy << " output ty: " << outputTy
- << "\n";
- if (!inputTy || !outputTy)
- return failure();
-
- // Check if the defining op of the input is also an
- // UnrealizedConversionCastOp.
- auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
- if (!definingOp)
- return rewriter.notifyMatchFailure(
- op, "input defining op is not an UnrealizedConversionCastOp");
- auto inputOfDefiningOp = definingOp.getInputs()[0];
- // If the input of the defining op and output type are both vector types
- // have same number of elements, insert a shape cast.
- auto inputOfDefiningOpTy =
- dyn_cast<VectorType>(inputOfDefiningOp.getType());
- if (inputOfDefiningOpTy && outputTy &&
- inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
- auto shapeCast = vector::ShapeCastOp::create(
- rewriter, op.getLoc(), outputTy, inputOfDefiningOp);
- rewriter.replaceOp(op, shapeCast.getResult());
- return success();
- }
-
- return rewriter.notifyMatchFailure(
- op, "unable to resolve unrealized conversion cast");
- }
-
- private:
- llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts;
+ root->walk(
+ [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });
+ // Perform a structural type conversion to convert structural ops to have WI
+ // types. This will insert UnrealizedConversionCastOps to make the IR
+ // valid.
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ UnrealizedConversionCastOp castOp =
+ UnrealizedConversionCastOp::create(builder, loc, type, inputs);
+ return castOp.getResult(0);
};
- // Finally, remove unnecessary UnrealizedConversionCastOps.
+ {
+ ConversionTarget target(getContext());
+ TypeConverter typeConverter;
+ RewritePatternSet patterns(&getContext());
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
+ xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+ scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+ patterns, target);
+ target.addLegalOp<UnrealizedConversionCastOp>();
+ (void)applyPartialConversion(root, target, std::move(patterns));
+ }
+ // Apply the XeGPU subgroup to workitem distribution patterns.
+ {
+ ConversionTarget target(getContext());
+ TypeConverter typeConverter;
+ typeConverter.addTargetMaterialization(materializeCast);
+ typeConverter.addSourceMaterialization(materializeCast);
+ RewritePatternSet patterns(&getContext());
+ xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ typeConverter, patterns, target);
+ target.addLegalOp<UnrealizedConversionCastOp>();
+ (void)applyPartialConversion(root, target, std::move(patterns));
+ }
+ // Structural type conversion can generate some redundant
+ // UnrealizedConversionCastOps to materialize the SG type from type converted
+ // WI type. These are redundant at this point and can be eliminated by
+ // inserting shape casts instead.
OpBuilder builder(root);
root->walk([&](UnrealizedConversionCastOp op) {
// If this op existed before, nothing to do.
>From 2cd613fc7b65fe57dcd7d36fdbd3c194052fab2f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 20:25:59 +0000
Subject: [PATCH 10/18] pack and unpack support
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 12 +++++++
.../XeGPUSgToWiDistributeExperimental.cpp | 12 +++++++
.../Transforms/XeGPUSubgroupDistribute.cpp | 32 ++-----------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 25 +++++++++++++++
4 files changed, 51 insertions(+), 30 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index d327a431d6ec4..9f391332df37a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -26,6 +26,10 @@ namespace xegpu {
class DistributeLayoutAttr;
class LayoutAttr;
class TensorDescType;
+
+namespace uArch {
+class uArch;
+} // namespace uArch
} // namespace xegpu
namespace xegpu {
@@ -207,6 +211,14 @@ void recoverTemporaryLayoutsDeprecated(Operation *op);
/// a layout attribute.
bool recoverTemporaryLayouts(Operation *rootOp);
+/// Helper function to check if the layout is packed. Layout is packed if it is
+/// 2D and lane_data[0] != 1 (data packed from col dimension).
+/// TODO: Move to target info.
+bool requirePacked(const LayoutAttr layout);
+
+/// Helper function to check if the layout requires a transpose effect.
+bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch);
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 8005637da4bf8..c83c9a53c8734 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
@@ -120,6 +121,12 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
if (op.getTensorDescType().getLayout() != layout)
return rewriter.notifyMatchFailure(
op, "conflicting layout attributes on tensor descriptor and anchor");
+ auto uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+ op, "xegpu::LoadNdOp require target attribute attached to "
+ "determine transpose "
+ "requirement");
auto supportedWiResultTyOrFailure =
xegpu::getDistributedVectorType(op.getTensorDescType());
auto expectedWiResultTyOrFailure =
@@ -136,6 +143,11 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr(), /**layout**/ nullptr);
+ // Set the packed attribute if the layout requires it.
+ newOp.setPacked(xegpu::requirePacked(cast<xegpu::LayoutAttr>(layout)));
+ // Set the transpose attribute if the layout requires it.
+ if (xegpu::requireTranspose(cast<xegpu::LayoutAttr>(layout), uArch))
+ newOp.setTranspose(DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
expectedWiResultTyOrFailure.value()));
return success();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8beadeb5da309..141247931a233 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -103,34 +103,6 @@ static Value resolveDistributedTy(Value orig, T expected,
return orig;
}
-/// Helper function to check if the layout is packed. Layout is packed if it is
-/// 2D and lane_data[0] != 1 (data packed from col dimension).
-/// TODO: Move to target info.
-static bool requirePacked(const xegpu::LayoutAttr layout) {
- if (!layout)
- return false;
- auto laneData = layout.getEffectiveLaneDataAsInt();
- if (laneData.size() != 2)
- return false;
- return laneData[0] != 1;
-}
-
-/// Helper function to check if the layout requires a transpose effect.
-static bool requireTranspose(const xegpu::LayoutAttr layout,
- const xegpu::uArch::uArch *uArch) {
- // Return false for unsupported targets.
- // TODO: Add more support or move to target info.
- if (uArch->getName().equals_insensitive("pvc") &&
- uArch->getName().equals_insensitive("bmg"))
- return false;
- if (!layout)
- return false;
- auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
- if (laneLayout.size() != 2)
- return false;
- return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
-}
-
/// Given a vector type and its distributed vector type, return the list of
/// dimensions that are distributed.
static SmallVector<int64_t> getDistributedDims(VectorType originalType,
@@ -533,9 +505,9 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
newLoadOperands, loadOp->getAttrs());
xegpu::removeLayoutAttrs(newLoadOp);
// Set the packed attribute if the layout requires it.
- newLoadOp.setPacked(requirePacked(layout));
+ newLoadOp.setPacked(xegpu::requirePacked(layout));
// Set the transpose attribute if the layout requires it.
- if (requireTranspose(layout, uArch))
+ if (xegpu::requireTranspose(layout, uArch))
newLoadOp.setTranspose(
DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 1a1f331efe608..062d0a872bc82 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
@@ -762,3 +763,27 @@ template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
template int
xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
ArrayRef<unsigned> candidateMultiples);
+
+bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
+  // A null layout has no lane_data to inspect, so it is never packed.
+  if (!layout)
+    return false;
+  // Packed means: exactly two lane_data entries and the first one is not 1.
+  const auto laneDataDims = layout.getEffectiveLaneDataAsInt();
+  return laneDataDims.size() == 2 && laneDataDims[0] != 1;
+}
+
+/// Returns true when `layout` is a 2D layout whose lane layout is
+/// [subgroupSize, 1] on a supported target ("pvc" or "bmg", compared
+/// case-insensitively). Returns false for a null layout, a non-2D lane
+/// layout, or any other target. `uArch` is dereferenced unconditionally and
+/// must not be null.
+bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
+                             const xegpu::uArch::uArch *uArch) {
+  // Return false for unsupported targets. The name can never equal both
+  // "pvc" and "bmg" at once, so the guard has to test that it matches
+  // neither of them.
+  // TODO: Add more support or move to target info.
+  if (!uArch->getName().equals_insensitive("pvc") &&
+      !uArch->getName().equals_insensitive("bmg"))
+    return false;
+  if (!layout)
+    return false;
+  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
+  if (laneLayout.size() != 2)
+    return false;
+  // All lanes are laid out along dim 0 and none along dim 1.
+  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
+}
>From 5e9b1d0497cba45bee9486cccaca0f38d374e48a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 22:45:17 +0000
Subject: [PATCH 11/18] save work
---
.../XeGPUSgToWiDistributeExperimental.cpp | 39 ++++++---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 87 ++++++++++++++++++-
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 45 ++++++++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 14 ++-
4 files changed, 171 insertions(+), 14 deletions(-)
create mode 100644 mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index c83c9a53c8734..efe8cbe8caedc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -86,8 +86,7 @@ static LogicalResult verifyLayouts(Operation *root) {
return walkResult.wasInterrupted() ? failure() : success();
}
-struct CreateNdDescOpPattern
- : public OpConversionPattern<xegpu::CreateNdDescOp> {
+struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
LogicalResult
@@ -106,7 +105,7 @@ struct CreateNdDescOpPattern
}
};
-struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
+struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
LogicalResult
@@ -154,7 +153,7 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
}
};
-struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
+struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
LogicalResult
@@ -192,7 +191,7 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
}
};
-struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
+struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
LogicalResult
@@ -242,8 +241,8 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
}
};
-struct ElementWiseOpPattern : public ConversionPattern {
- ElementWiseOpPattern(TypeConverter &typeConverter, MLIRContext *ctx)
+struct WgToWiElementWise : public ConversionPattern {
+ WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
: ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
LogicalResult
@@ -287,7 +286,7 @@ struct ElementWiseOpPattern : public ConversionPattern {
}
};
-struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
+struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
LogicalResult
@@ -327,6 +326,26 @@ struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
}
};
+/// Conversion pattern for xegpu.prefetch_nd: replaces an op that carries an
+/// anchor layout with an equivalent op that has no layout attribute and uses
+/// the tensor descriptor operand supplied by the conversion adaptor.
+struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
+  using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(xegpu::PrefetchNdOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+    // If no layout, nothing to do. Report the reason like the sibling
+    // patterns do so that the failure shows up in conversion debug output.
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          op, "xegpu::PrefetchNdOp has no anchor layout");
+
+    // Same offsets and cache hints as the original op; the layout is dropped.
+    xegpu::PrefetchNdOp::create(rewriter, op.getLoc(), adaptor.getTensorDesc(),
+                                op.getMixedOffsets(), op.getL1HintAttr(),
+                                op.getL2HintAttr(), op.getL3HintAttr(),
+                                /**layout**/ nullptr);
+    // prefetch_nd is replaced by a create + erase pair, not replaceOp.
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -521,7 +540,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
- patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
- DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
+ patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
+ WgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
typeConverter, patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index c89586f450164..715b327ba62a1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,7 +2,12 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
// --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+
+
gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @create_nd_tdesc
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[TD:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
@@ -10,6 +15,9 @@ gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
gpu.return
}
+// CHECK-LABEL: gpu.func @cerate_nd_tedesc_nonmemref_source
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[TD:.*]] = xegpu.create_nd_tdesc %{{.*}}, shape : [256, 256], strides : [256, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
@@ -17,6 +25,10 @@ gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
gpu.return
}
+// CHECK-LABEL: gpu.func @load_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
gpu.func @load_nd() {
%c0 = arith.constant 0 : index
%0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -25,6 +37,36 @@ gpu.func @load_nd() {
gpu.return
}
+// CHECK-LABEL: gpu.func @load_nd_packed
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
+gpu.func @load_nd_packed() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @load_nd_transpose
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf32> to vector<1x8xf32>
+gpu.func @load_nd_transpose() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @store_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
+// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[CAST2]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[CAST3]], %{{.*}}[%[[C0]], %[[C0]]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.func @store_nd() {
%c0 = arith.constant 0 : index
%0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -36,7 +78,20 @@ gpu.func @store_nd() {
gpu.return
}
-gpu.func @dpas_op() {
+// CHECK-LABEL: gpu.func @dpas
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
+// CHECK: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
+// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[CAST6]], %[[CAST5]], %[[CAST4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[CAST7:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: gpu.return
+gpu.func @dpas() {
%c0 = arith.constant 0 : index
%0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -54,7 +109,14 @@ gpu.func @dpas_op() {
gpu.return
}
-gpu.func @elementwise_op() {
+// CHECK-LABEL: gpu.func @elementwise
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf32> -> vector<16xf32>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf32> to vector<16x1xf32>
+// CHECK: %[[ADD:.*]] = arith.addf %[[CAST2]], %[[CST]] : vector<16x1xf32>
+// CHECK: gpu.return
+gpu.func @elementwise() {
%c0 = arith.constant 0 : index
%0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
dense<1.0> : vector<16x16xf32>
@@ -66,4 +128,25 @@ gpu.func @elementwise_op() {
: vector<16x16xf32>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @arith_constant
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
+// CHECK: gpu.return
+gpu.func @arith_constant() {
+ %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ dense<1.0> : vector<16x16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @prefetch_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: xegpu.prefetch_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16>
+// CHECK: gpu.return
+gpu.func @prefetch_nd() {
+ %c0 = arith.constant 0 : index
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.prefetch_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
new file mode 100644
index 0000000000000..c54d56128f21a
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -0,0 +1,45 @@
+gpu.module @xevm_module{
+gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c8 = arith.constant 8 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %0 = arith.muli %block_id_x, %c8 : index
+ %1 = arith.muli %block_id_y, %c16 : index
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %3 = xegpu.load_nd %2[%0, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+ %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+ %7 = xegpu.load_nd %5[%0, %arg3]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ %8 = xegpu.load_nd %6[%arg3, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+ %9 = xegpu.dpas %7, %8, %arg4
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+ scf.yield %9 : vector<8x16xf32>
+ } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 03a11908499e1..99cc09f30c821 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -254,6 +254,9 @@ struct TestXeGPUSGDistribute
}
};
+/// This test pass is intended to test the subgroup to workitem distribution of
+/// xegpu/vector/arith operations in isolation; it does not handle any
+/// structural ops such as scf.for.
struct TestXeGPUSgToWiDistributeExperimental
: public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
OperationPass<gpu::GPUModuleOp>> {
@@ -285,12 +288,19 @@ struct TestXeGPUSgToWiDistributeExperimental
void runOnOperation() override {
MLIRContext *ctx = &getContext();
TypeConverter typeConverter;
+ // Define type materializations using UnrealizedConversionCastOp.
+ auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+ mlir::ValueRange inputs,
+ mlir::Location loc) -> mlir::Value {
+ return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+ .getResult(0);
+ };
+ typeConverter.addSourceMaterialization(materializeCast);
+ typeConverter.addTargetMaterialization(materializeCast);
ConversionTarget target(*ctx);
RewritePatternSet patterns(ctx);
xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
typeConverter, patterns, target);
- scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
- patterns, target);
(void)applyPartialConversion(getOperation(), target, std::move(patterns));
}
};
>From 55436910a83aee6fa9a346d89c81b07de29e64d6 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 23:40:55 +0000
Subject: [PATCH 12/18] fix
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 2 +-
.../XeGPUSgToWiDistributeExperimental.cpp | 11 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 4 +-
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 174 ++++++++++++++++++
4 files changed, 186 insertions(+), 5 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 9f391332df37a..e40d4eb6f8b9a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -28,7 +28,7 @@ class LayoutAttr;
class TensorDescType;
namespace uArch {
-class uArch;
+struct uArch;
} // namespace uArch
} // namespace xegpu
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index efe8cbe8caedc..ba960ee00ed5d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -26,6 +26,7 @@
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,7 +54,7 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
v.getType().getNumElements() == expectedTy.getNumElements())
return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
- // else create an unrealized cast.
+ // Else create an unrealized cast.
auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
expectedTy, ValueRange{v});
return newOp.getResult(0);
@@ -272,7 +273,13 @@ struct WgToWiElementWise : public ConversionPattern {
VectorType newResultType = wiShapeOrFailure.value();
OperationState state(op->getLoc(), op->getName());
- state.addOperands(operands);
+ // Cast the types of operands to the expected workitem types.
+ SmallVector<Value> newOperands =
+ llvm::map_to_vector(operands, [&](Value v) {
+ return castValueTo(rewriter, cast<TypedValue<VectorType>>(v),
+ newResultType);
+ });
+ state.addOperands(newOperands);
state.addTypes(newResultType);
// Copy all attributes except for DistributeLayoutAttr.
for (auto attr : op->getAttrs()) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 715b327ba62a1..e9d374c8ca2f1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \
-// --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
+// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index c54d56128f21a..35aa83dfb34af 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -1,3 +1,31 @@
+// RUN: mlir-opt --allow-unregistered-dialect --xevm-attach-target='module=xevm_* chip=pvc' \
+// RUN: --xegpu-sg-to-wi-distribute-experimental --split-input-file %s --canonicalize --cse | FileCheck %s
+
+// CHECK-LABEL: gpu.func @gemm
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK: %[[BID_X:.*]] = gpu.block_id x
+// CHECK: %[[BID_Y:.*]] = gpu.block_id y
+// CHECK: %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK: %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK: %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK: %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK: %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK: %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK: %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[CAST_FOR:.*]] = vector.shape_cast %[[FOR]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: xegpu.store_nd %[[CAST_FOR]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: gpu.return
gpu.module @xevm_module{
gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
%c0 = arith.constant 0 : index
@@ -42,4 +70,150 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @gemm_with_preop
+// CHECK-DAG : %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG : %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG : %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG : %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x1xbf16>
+// CHECK : %[[BID_X:.*]] = gpu.block_id x
+// CHECK : %[[BID_Y:.*]] = gpu.block_id y
+// CHECK : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK : %[[CAST_A:.*]] = vector.shape_cast %[[LOAD_A]] : vector<8xbf16> to vector<8x1xbf16>
+// CHECK : %[[PREOP:.*]] = arith.addf %[[CAST_A]], %[[CST]] : vector<8x1xbf16>
+// CHECK : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK : %[[CAST_PREOP:.*]] = vector.shape_cast %[[PREOP]] : vector<8x1xbf16> to vector<8xbf16>
+// CHECK : %[[DPAS:.*]] = xegpu.dpas %[[CAST_PREOP]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK : %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK : scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK : } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK : %[[CAST_FOR:.*]] = vector.shape_cast %[[FOR]] : vector<8x1xf32> to vector<8xf32>
+// CHECK : xegpu.store_nd %[[CAST_FOR]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK : gpu.return
+gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c8 = arith.constant 8 : index
+ %c1024 = arith.constant 1024 : index
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.0> : vector<8x16xbf16>
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %0 = arith.muli %block_id_x, %c8 : index
+ %1 = arith.muli %block_id_y, %c16 : index
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %3 = xegpu.load_nd %2[%0, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+ %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+ %7 = xegpu.load_nd %5[%0, %arg3]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ %preop = arith.addf %7, %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>
+ %8 = xegpu.load_nd %6[%arg3, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+ %9 = xegpu.dpas %preop, %8, %arg4
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+ scf.yield %9 : vector<8x16xf32>
+ } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @gemm_with_postop
+// CHECK-DAG : %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG : %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG : %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG : %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK : %[[BID_X:.*]] = gpu.block_id x
+// CHECK : %[[BID_Y:.*]] = gpu.block_id y
+// CHECK : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK : %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK : %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK : scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK : } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK : %[[POSTOP:.*]] = math.exp %[[FOR]] : vector<8x1xf32>
+// CHECK : %[[CAST_POSTOP:.*]] = vector.shape_cast %[[POSTOP]] : vector<8x1xf32> to vector<8xf32>
+// CHECK : xegpu.store_nd %[[CAST_POSTOP]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.func @gemm_with_postop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c8 = arith.constant 8 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %0 = arith.muli %block_id_x, %c8 : index
+ %1 = arith.muli %block_id_y, %c16 : index
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %3 = xegpu.load_nd %2[%0, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+ -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+ %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+ %7 = xegpu.load_nd %5[%0, %arg3]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ %8 = xegpu.load_nd %6[%arg3, %1]
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+ %9 = xegpu.dpas %7, %8, %arg4
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+ scf.yield %9 : vector<8x16xf32>
+ } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %postop = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
+ xegpu.store_nd %postop, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+}
+
}
>From a2ce36b4736bbf43bf608ad9664982cfa5a5e14c Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 23:53:12 +0000
Subject: [PATCH 13/18] fix
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index e9d374c8ca2f1..0e9843f4626d4 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -80,14 +80,14 @@ gpu.func @store_nd() {
// CHECK-LABEL: gpu.func @dpas
// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
-// CHECK: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
-// CHECK: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
-// CHECK: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
+// CHECK-DAG: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
+// CHECK-DAG: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
+// CHECK-DAG: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[CAST6]], %[[CAST5]], %[[CAST4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
// CHECK: %[[CAST7:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
// CHECK: gpu.return
>From 33b11c86b3abeedbd43da4e7e97e21ec381e861a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 23 Jan 2026 22:53:55 +0000
Subject: [PATCH 14/18] add comments
---
.../XeGPUSgToWiDistributeExperimental.cpp | 23 +++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index ba960ee00ed5d..88079b3edddf6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -46,10 +46,13 @@ using namespace mlir;
namespace {
+/// Casts the given vector value `v` to the expected vector type `expectedTy`.
static Value castValueTo(ConversionPatternRewriter &rewriter,
TypedValue<VectorType> v, VectorType expectedTy) {
+ // If the type matches, simply return the value itself.
if (v.getType() == expectedTy)
return v;
+ // If only shape differs, use shape cast.
if (isa<VectorType>(v.getType()) &&
v.getType().getNumElements() == expectedTy.getNumElements())
return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
@@ -60,6 +63,8 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
return newOp.getResult(0);
}
+/// Checks if all XeGPU anchor ops and vector results have valid layouts.
+/// TODO: This function can be removed once the full layout refactoring is done.
static LogicalResult verifyLayouts(Operation *root) {
auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
@@ -87,6 +92,8 @@ static LogicalResult verifyLayouts(Operation *root) {
return walkResult.wasInterrupted() ? failure() : success();
}
+/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
+/// op. This simply drops the layout attribute from the tensor descriptor type.
struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -106,6 +113,9 @@ struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
}
};
+/// Distributes a subgroup-level LoadNd op to workitem-level LoadNd op. Output
+/// of workitem-level LoadNd op is 1D. ShapeCast is added to restore the
+/// original rank.
struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
@@ -154,6 +164,9 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
}
};
+/// Distributes a subgroup-level StoreNd op to workitem-level StoreNd op. Stored
+/// value in workitem-level StoreNd op is 1D. ShapeCast is added to cast the
+/// incoming value to 1D.
struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
@@ -192,6 +205,9 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
}
};
+/// Distributes a subgroup-level Dpas op to workitem-level Dpas op. All inputs
+/// and output of workitem-level Dpas op are 1D. Necessary casts are added to
+/// convert the inputs and output to/from 1D.
struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
@@ -223,8 +239,6 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
return rewriter.notifyMatchFailure(
op, "unable to compute expected workitem vector type for DpasOp from "
"lane layout");
- // llvm::errs() << "adaptor acc type: " << adaptor.getAcc().getType() <<
- // "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
auto newOp = xegpu::DpasOp::create(
rewriter, op->getLoc(), wiResultTyOrFailure.value(),
castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
@@ -242,6 +256,8 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
}
};
+/// Distributes elementwise ops to workitem-level elementwise ops. This
+/// currently handles elementwise ops with single result only.
struct WgToWiElementWise : public ConversionPattern {
WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
: ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
@@ -293,6 +309,8 @@ struct WgToWiElementWise : public ConversionPattern {
}
};
+/// Distributes a subgroup-level arith ConstantOp to workitem-level arith
+/// ConstantOp.
struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
@@ -333,6 +351,7 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
}
};
+/// Distributes a subgroup-level PrefetchNd op to workitem-level PrefetchNd op.
struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
>From 1f2eee1d5be8be4b4895307684a6301aa2c3cc62 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 23 Jan 2026 22:59:57 +0000
Subject: [PATCH 15/18] add comments
---
.../XeGPUSgToWiDistributeExperimental.cpp | 26 +++++++++++--------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 88079b3edddf6..4e38c7de6bd8e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -64,7 +64,6 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
}
/// Checks if all XeGPU anchor ops and vector results have valid layouts.
-/// TODO: This function can be removed once the full layout refactoring is done.
static LogicalResult verifyLayouts(Operation *root) {
auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
@@ -383,6 +382,7 @@ struct XeGPUSgToWiDistributeExperimentalPass
void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
// Verify if all XeGPU anchor ops and vector ops have result layouts.
+ // TODO: This can be removed once the full layout refactoring is done.
Operation *root = getOperation();
if (failed(verifyLayouts(root))) {
LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
@@ -432,6 +432,11 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
// UnrealizedConversionCastOps to materialize the SG type from type converted
// WI type. These are redundant at this point and can be eliminated by
// inserting shape casts instead.
+ // Example:
+ // %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32> to vector<16x16xf32>
+ // %2 = UnrealizedConversionCastOp %1 : vector<16x16xf32> to vector<16xf32>
+ // This can be replaced with:
+ // %2 = vector.shape_cast %0 : vector<16x1xf32> to vector<16xf32>
OpBuilder builder(root);
root->walk([&](UnrealizedConversionCastOp op) {
// If this op existed before, nothing to do.
@@ -485,22 +490,21 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
TypeConverter &typeConverter) {
- // Populate type conversions.
- // - Any type other than TensorDescType and VectorType are legal as is.
+ // Any type other than TensorDescType and VectorType is legal as is.
typeConverter.addConversion([](Type type) -> std::optional<Type> {
if (!isa<TensorDescType, VectorType>(type))
return type;
return std::nullopt;
});
- // - For TensorDescType, drop the layout attribute if any.
+ // For TensorDescType, drop the layout attribute if any.
typeConverter.addConversion([](TensorDescType type) -> Type {
if (type.getLayoutAttr()) {
return type.dropLayouts();
}
return type;
});
- // - For VectorType, check if there is a distribute layout attribute on the
- // value. If so, convert to the distributed vector type based on the layout.
+ // For VectorType, check if there is a distribute layout attribute on the
+ // value. If so, convert to the distributed vector type based on the layout.
typeConverter.addConversion([](Value v) -> std::optional<Type> {
auto type = v.getType();
// If value is not vector type, nothing to do.
@@ -522,26 +526,26 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target) {
populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
- // - CreateNdDescOp is legal only if its result type has no layout attribute.
+ // CreateNdDescOp is legal only if its result type has no layout attribute.
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
[&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
- // - Any anchor XeGPU op is legal only if it has no anchor layout.
+ // Any anchor XeGPU op is legal only if it has no anchor layout.
target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
if (!anchorOp)
return true;
return !anchorOp.getAnchorLayout();
});
+ // Arith constants are legal only if they have no temporary layout attribute.
target.addDynamicallyLegalOp<arith::ConstantOp>(
[=](arith::ConstantOp op) -> bool {
// If the result type is not a vector, it's legal.
if (!isa<VectorType>(op.getResult().getType()))
return true;
- // For vector result types, check if it has a layout attribute.
return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
});
- // - In math and arith dialects, only handle elementwise ops with a single
- // result and with a result layout attribute.
+ // In math and arith dialects, only handle elementwise ops with a single
+ // result and with a result layout attribute.
target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
[=](Operation *op) -> std::optional<bool> {
// Only handle elementwise mappable ops
>From 678162c152d790d1d7930b4c671425f25adda842 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 24 Jan 2026 00:06:58 +0000
Subject: [PATCH 16/18] add comments
---
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 60 +++++++++----------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index 35aa83dfb34af..9172cd3018b71 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -6,19 +6,19 @@
// CHECK-DAG : %[[C16:.*]] = arith.constant 16 : index
// CHECK-DAG : %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG : %[[C1024:.*]] = arith.constant 1024 : index
-// CHECK : %[[BID_X:.*]] = gpu.block_id x
-// CHECK : %[[BID_Y:.*]] = gpu.block_id y
-// CHECK : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG : %[[BID_X:.*]] = gpu.block_id x
+// CHECK-DAG : %[[BID_Y:.*]] = gpu.block_id y
+// CHECK-DAG : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
// CHECK : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
// CHECK : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
// CHECK : %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
// CHECK : %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
// CHECK : scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
@@ -77,21 +77,21 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
// CHECK-DAG : %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG : %[[C1024:.*]] = arith.constant 1024 : index
// CHECK : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x1xbf16>
-// CHECK : %[[BID_X:.*]] = gpu.block_id x
-// CHECK : %[[BID_Y:.*]] = gpu.block_id y
-// CHECK : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG : %[[BID_X:.*]] = gpu.block_id x
+// CHECK-DAG : %[[BID_Y:.*]] = gpu.block_id y
+// CHECK-DAG : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
// CHECK : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
// CHECK : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
// CHECK : %[[CAST_A:.*]] = vector.shape_cast %[[LOAD_A]] : vector<8xbf16> to vector<8x1xbf16>
// CHECK : %[[PREOP:.*]] = arith.addf %[[CAST_A]], %[[CST]] : vector<8x1xbf16>
-// CHECK : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
// CHECK : %[[CAST_PREOP:.*]] = vector.shape_cast %[[PREOP]] : vector<8x1xbf16> to vector<8xbf16>
// CHECK : %[[DPAS:.*]] = xegpu.dpas %[[CAST_PREOP]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
// CHECK : %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
@@ -151,19 +151,19 @@ gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024
// CHECK-DAG : %[[C16:.*]] = arith.constant 16 : index
// CHECK-DAG : %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG : %[[C1024:.*]] = arith.constant 1024 : index
-// CHECK : %[[BID_X:.*]] = gpu.block_id x
-// CHECK : %[[BID_Y:.*]] = gpu.block_id y
-// CHECK : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG : %[[BID_X:.*]] = gpu.block_id x
+// CHECK-DAG : %[[BID_Y:.*]] = gpu.block_id y
+// CHECK-DAG : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
// CHECK : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
// CHECK : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG : %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG : %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG : %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
// CHECK : %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
// CHECK : %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
// CHECK : scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
>From 5a2bf18a0fa4b1990105cb6a072a15cb99a92145 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 27 Jan 2026 19:42:57 +0000
Subject: [PATCH 17/18] fix
---
.../XeGPUSgToWiDistributeExperimental.cpp | 21 +++----------------
1 file changed, 3 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 4e38c7de6bd8e..67f8cf633b849 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -288,13 +288,7 @@ struct WgToWiElementWise : public ConversionPattern {
VectorType newResultType = wiShapeOrFailure.value();
OperationState state(op->getLoc(), op->getName());
- // Cast the types of operands to the expected workitem types.
- SmallVector<Value> newOperands =
- llvm::map_to_vector(operands, [&](Value v) {
- return castValueTo(rewriter, cast<TypedValue<VectorType>>(v),
- newResultType);
- });
- state.addOperands(newOperands);
+ state.addOperands(operands);
state.addTypes(newResultType);
// Copy all attributes except for DistributeLayoutAttr.
for (auto attr : op->getAttrs()) {
@@ -413,16 +407,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
patterns, target);
- target.addLegalOp<UnrealizedConversionCastOp>();
- (void)applyPartialConversion(root, target, std::move(patterns));
- }
- // Apply the XeGPU subgroup to workitem distribution patterns.
- {
- ConversionTarget target(getContext());
- TypeConverter typeConverter;
- typeConverter.addTargetMaterialization(materializeCast);
- typeConverter.addSourceMaterialization(materializeCast);
- RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
typeConverter, patterns, target);
target.addLegalOp<UnrealizedConversionCastOp>();
@@ -453,7 +437,8 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
return;
// Check if the defining op of the input is also an
- // UnrealizedConversionCastOp and it has a single user (which is this op).
+ // UnrealizedConversionCastOp and it has a single user (which is this
+ // op).
auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
if (!definingOp || !definingOp->hasOneUse())
return;
>From d548f9507674c4156003a083fdfedc3406c386dc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 28 Jan 2026 21:52:11 +0000
Subject: [PATCH 18/18] address issues
---
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 2 --
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 6 +++---
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index e40d4eb6f8b9a..700db5f9dd9be 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -71,8 +71,6 @@ FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
/// according to the lane_layout. We simply divide each dimension of tensor
/// descriptor shape by corresponding lane_layout dimension. If
/// array_length > 1, that is appended to the front of the distributed shape.
-/// NOTE: This is the vector type that will be returned by the
-/// gpu.warp_execute_on_lane0 op.
///
/// Examples:
/// | original vector shape | lane_layout | distributed vector shape |
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 67f8cf633b849..4ae858363d5b6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -257,8 +257,8 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
/// Distributes elementwise ops to workitem-level elementwise ops. This
/// currently handles elementwise ops with single result only.
-struct WgToWiElementWise : public ConversionPattern {
- WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
+struct SgToWiElementWise : public ConversionPattern {
+ SgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
: ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
LogicalResult
@@ -556,6 +556,6 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
- WgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
+ SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
typeConverter, patterns.getContext());
}
More information about the Mlir-commits mailing list