[Mlir-commits] [mlir] 632c5a3 - [MLIR][XeGPU] Relax the slice layout check for broadcast operand in subgroup distribution (#181935)

Tue Feb 17 23:00:11 PST 2026

Author: Jianhui Li
Date: 2026-02-17T23:00:06-08:00
New Revision: 632c5a3738b9e9344b8d2f3d66f8aba5ff122f3c

URL: https://github.com/llvm/llvm-project/commit/632c5a3738b9e9344b8d2f3d66f8aba5ff122f3c
DIFF: https://github.com/llvm/llvm-project/commit/632c5a3738b9e9344b8d2f3d66f8aba5ff122f3c.diff

LOG: [MLIR][XeGPU] Relax the slice layout check for broadcast operand in subgroup distribution (#181935)

This PR relaxes the operand layout check for the broadcast op in subgroup
distribution. Instead of failing the pattern match, it now issues a warning
and proceeds with the distribution. The operand layout may be a non-slice
layout and still support a valid subgroup distribution.

Added: 
    

Modified: 
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
    mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index b8c4a309b8eb2..99c2da386fab6 100644

--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1511,9 +1511,8 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
         // Case 1: source is lower-rank than result.
         bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
         if (!isSliceOf)
-          return rewriter.notifyMatchFailure(
-              warpOp,
-              "Broadcast input layout must be a slice of result layout.");
+          broadcastOp.emitWarning()
+              << "Broadcast input layout must be a slice of result layout.";
       }
       // case 2: source and result have same rank
       if (rankDiff == 0) {

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 645e889d40657..fb23f38b44b46 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1038,28 +1038,39 @@ gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane
 // CHECK-SAME: (%[[ARG0:.*]]: index) {
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1xf16>)
-// CHECK: %[[DEF:.*]] = "some_def"()
-// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF]]
-// CHECK: gpu.yield %[[BCAST_INNER]], %[[DEF]]
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#1 : vector<1xf16> to vector<16x1xf16>
-// CHECK: "some_use"(%[[BCAST]])
-gpu.func  @vector_broadcast_1d_to_2d_broadcast_within_lane(%laneid: index) {
-
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
+// CHECK: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1x16x1xf16>, vector<1xf16>, vector<16x1xf16>)
+// CHECK: %[[DEF0:.*]] = "some_def"() : () -> vector<16xf16>
+// CHECK: %[[DEF1:.*]] = "some_def"() : () -> vector<16x16xf16>
+// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF0]]
+// CHECK: %[[CAST_INNER:.*]] = vector.shape_cast %[[DEF1]] : vector<16x16xf16> to vector<1x16x16xf16>
+// CHECK: gpu.yield %[[BCAST_INNER]], %[[CAST_INNER]], %[[DEF0]], %[[DEF1]]
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[R]]#3 : vector<16x1xf16> to vector<1x16x1xf16>
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#2 : vector<1xf16> to vector<16x1xf16>
+// CHECK: "some_use"(%[[BCAST]]) : (vector<16x1xf16>) -> ()
+// CHECK: "some_use"(%[[CAST]]) : (vector<1x16x1xf16>) -> ()
+gpu.func  @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane(%laneid: index) {
+
+  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>, vector<1x16x1xf16>) {
 
     %1 = "some_def"() : () -> vector<16xf16>
+    %3 = "some_def"() : () -> vector<16x16xf16>
 
     %2 = vector.broadcast %1 {
       layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
     } : vector<16xf16> to vector<16x16xf16>
 
-    gpu.yield %2 : vector<16x16xf16>
+    %4 = vector.broadcast %3 {
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
+    } : vector<16x16xf16> to vector<1x16x16xf16>
+
+    gpu.yield %2, %4 : vector<16x16xf16>, vector<1x16x16xf16>
   }
-  "some_use"(%r) : (vector<16x1xf16>) -> ()
+  "some_use"(%r#0) : (vector<16x1xf16>) -> ()
+  "some_use"(%r#1) : (vector<1x16x1xf16>) -> ()
   gpu.return
 }