[Mlir-commits] [mlir] [mlir][xegpu] Add more tests in XeGPU subgroup distribution. (PR #162543)
Charitha Saumya
llvmlistbot at llvm.org
Fri Oct 10 09:27:26 PDT 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/162543
>From 1dca2cd22e82200db0a4964c0abb609df9c8f889 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 3 Oct 2025 21:57:28 +0000
Subject: [PATCH 1/5] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 4 +++
.../Transforms/XeGPUSubgroupDistribute.cpp | 10 +++++--
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 30 +++++++++++++++++++
3 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index b74c15e5b7ac1..a480195eebd00 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -64,6 +64,10 @@ void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU SIMT distribution into `patterns`.
void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
+/// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
+void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU workgroup to subgroup distribution into
+/// `patterns`.
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f1dbc5ddb2022..26770b3c003ea 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -195,8 +195,7 @@ static bool requireTranspose(const xegpu::LayoutAttr layout,
/// }
/// return %0
/// }
-struct MoveFuncBodyToWarpExecuteOnLane0
- : public OpRewritePattern<gpu::GPUFuncOp> {
+struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
@@ -1447,6 +1446,11 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
/*pattern benefit=*/highPatternBenefit);
}
+void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext());
+}
+
void XeGPUSubgroupDistributePass::runOnOperation() {
// Step 1: Attach layouts to op operands.
// TODO: Following assumptions are made:
@@ -1473,7 +1477,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
// gpu.warp_execute_on_lane_0 operation.
{
RewritePatternSet patterns(&getContext());
- patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+ xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 6ba7a004b7d31..7176f9b946a17 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -247,6 +247,36 @@ struct TestXeGPUSGDistribute
}
};
+struct TestXeGPUMoveFuncBodyToWarpOp
+ : public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
+ OperationPass<gpu::GPUModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUMoveFuncBodyToWarpOp)
+
+ StringRef getArgument() const final {
+ return "test-xegpu-move-func-to-warp-op";
+ }
+
+ StringRef getDescription() const final {
+ return "Test the implementation of XeGPU move gpu function body to "
+ "WarpExecuteOnLane0 op.";
+ }
+
+ void getDependentDialects(::mlir::DialectRegistry ®istry) const override {
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<gpu::GPUDialect>();
+ }
+
+ TestXeGPUMoveFuncBodyToWarpOp() = default;
+ TestXeGPUMoveFuncBodyToWarpOp(const TestXeGPUMoveFuncBodyToWarpOp &pass) =
+ default;
+
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ }
+};
+
struct TestXeGPULayoutInterface
: public PassWrapper<TestXeGPULayoutInterface,
OperationPass<gpu::GPUModuleOp>> {
>From b7174ed554d9a17370deed7b401598a2e70439fc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 8 Oct 2025 19:21:44 +0000
Subject: [PATCH 2/5] add test pass for move warp op
---
.../XeGPU/move-gpu-func-to-warp-op.mlir | 63 +++++++++++++++++++
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 2 +
2 files changed, 65 insertions(+)
create mode 100644 mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
new file mode 100644
index 0000000000000..4e4744ddc57a6
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -0,0 +1,63 @@
+// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file %s | FileCheck %s
+
+gpu.module @test {
+gpu.func @empty() {
+ gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @empty() {
+// CHECK-NEXT: gpu.return
+// CHECK-NEXT: }
+
+// -----
+gpu.module @test {
+gpu.func @gemm(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = xegpu.dpas %2, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @gemm(
+// CHECK: %[[ARG0:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<8x16xf32>) {
+// CHECK: %[[LANEID:.*]] = gpu.lane_id
+// CHECK-NEXT: gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
+// CHECK-SAME: args(%[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) {
+// CHECK: ^bb0(%[[ARG3:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG4:[a-zA-Z0-9]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: memref<8x16xf32>):
+// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG3]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG4]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T1]][{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[T2]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+// CHECK-NEXT: %[[T5:.*]] = xegpu.dpas %[[T3]], %[[T4]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG5]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T5]], %[[T6]][%{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: gpu.return
+
+// -----
+gpu.module @test {
+gpu.func @already_in_warp_op() {
+ %laneid = gpu.lane_id
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ "some_op"() : () -> ()
+ gpu.yield
+ }
+ gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @already_in_warp_op() {
+// CHECK: %[[LANEID:.*]] = gpu.lane_id
+// CHECK: gpu.warp_execute_on_lane_0(%[[LANEID]])[16] {
+// CHECK: "some_op"() : () -> ()
+// CHECK: }
+// CHECK: gpu.return
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 7176f9b946a17..1869a9af34493 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -342,6 +343,7 @@ void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
PassRegistration<TestXeGPULayoutInterface>();
PassRegistration<TestXeGPUSGDistribute>();
+ PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
}
} // namespace test
} // namespace mlir
>From 65bac235cafcacad1e99a892a803e8a49d9c3ec8 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 8 Oct 2025 19:22:35 +0000
Subject: [PATCH 3/5] fix
---
mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
index 4e4744ddc57a6..d289d73e863c7 100644
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
gpu.module @test {
gpu.func @empty() {
>From 268c54557864816594a55283da3eab1e656a6376 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 8 Oct 2025 20:20:44 +0000
Subject: [PATCH 4/5] add more tests
---
.../XeGPU/subgroup-distribute-unit.mlir | 83 ++++++++++++++++++-
1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 40b66d18cc47f..f233dff609f2b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -530,7 +530,7 @@ gpu.module @xevm_module{
// CHECK-NEXT: }
// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
gpu.module @xevm_module{
- gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) {
+ gpu.func @vector_transpose(%laneid: index) {
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
%cst = "some_op"()
{layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
@@ -556,7 +556,7 @@ gpu.module @xevm_module{
// CHECK: }
// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
gpu.module @xevm_module{
- gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) {
+ gpu.func @vector_bitcast(%laneid: index) {
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
%cst = "some_op"()
{layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
@@ -573,3 +573,82 @@ gpu.module @xevm_module{
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing
+// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
+// CHECK: gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32>
+// CHECK: }
+// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32>
+gpu.module @xevm_module {
+ gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+ : () -> (vector<16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16xf32> to vector<1x16xf32>
+ gpu.yield %cast : vector<1x16xf32>
+ }
+ "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing(
+// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) {
+// CHECK: gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32>
+// CHECK: }
+// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32>
+gpu.module @xevm_module {
+ gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<1x16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
+ }
+ : vector<1x16xf32> to vector<16xf32>
+ gpu.yield %cast : vector<16xf32>
+ }
+ "some_user_op"(%r) : (vector<1xf32>) -> ()
+ gpu.return
+ }
+}
+
+// -----
+// NOTE: Layouts are still valid, but distribution still requires a slice layout for the operand.
+//
+// CHECK-LABEL: gpu.func @vector_shapecast_unsupported
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>) {
+// CHECK: %[[T1:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<1x16xf32>
+// CHECK: gpu.yield %[[T1]] : vector<1x16xf32>
+// CHECK: }
+// CHECK: "some_user_op"(%[[W]]) : (vector<1x1xf32>) -> ()
+// CHECK: gpu.return
+gpu.module @xevm_module {
+ gpu.func @vector_shapecast_unsupported(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
+ : () -> (vector<16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16xf32> to vector<1x16xf32>
+ gpu.yield %cast : vector<1x16xf32>
+ }
+ "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+ gpu.return
+ }
+}
>From b6b149c777738f2571542a24f4c332733e6b0333 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 9 Oct 2025 21:54:44 +0000
Subject: [PATCH 5/5] remove header
---
mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 1869a9af34493..76d461108b296 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -14,7 +14,6 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
-#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
More information about the Mlir-commits
mailing list