[Mlir-commits] [mlir] [MLIR] [XeGPU] Add distribution patterns for vector transpose, bitcast & mask ops in sg to wi pass (PR #187392)

Fri Mar 27 17:09:27 PDT 2026

================
@@ -461,6 +467,109 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_transpose
+// CHECK:         %[[SRC:.*]] = "some_op"()
+// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
+// CHECK-NEXT:    %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+// CHECK-NEXT:    gpu.return
+gpu.func @vector_transpose() {
+  %cst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+    : () -> (vector<16x2xf32>)
+  %transpose = vector.transpose %cst, [1, 0]
+    {
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x2xf32> to vector<2x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_bitcast
+// CHECK:         %[[SRC:.*]] = "some_op"()
+// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<4x32xi8> to vector<4x2xi8>
+// CHECK-NEXT:    %[[BC:.*]] = vector.bitcast %[[CAST]] : vector<4x2xi8> to vector<4x1xi16>
+// CHECK-NEXT:    gpu.return
+gpu.func @vector_bitcast() {
+  %cst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+    : () -> (vector<4x32xi8>)
+  %bitcast = vector.bitcast %cst
+    {
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<4x32xi8> to vector<4x16xi16>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_mask_1d
+//  CHECK-SAME: (%[[M0:.*]]: index)
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP]]()[%[[LANE_ID]], %[[M0]]]
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//       CHECK:   gpu.return
+gpu.func @create_mask_1d(%m0: index) {
+  %mask = vector.create_mask %m0
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_1d
+//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP1]]()[%[[LANE_ID]]]
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//       CHECK:   gpu.return
+gpu.func @constant_mask_1d() {
+  %mask = vector.constant_mask [4]
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_mask_2d
+//  CHECK-SAME: (%[[M0:.*]]: index, %[[M1:.*]]: index)
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
+//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
+//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
+//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
+//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP]]()[%[[REM2]], %[[M0]]]
+//       CHECK:   %[[BOUND1:.*]] = affine.apply #[[$MAP2]]()[%[[REM1]], %[[M1]]]
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[BOUND0]], %[[BOUND1]] : vector<1x2xi1>
+//       CHECK:   gpu.return
+gpu.func @create_mask_2d(%m0: index, %m1: index) {
+  %mask = vector.create_mask %m0, %m1
+    {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
+    : vector<8x4xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_2d
+//   CHECK-DAG:   %[[C2_CONST:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
+//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
+//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
+//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
+//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP3]]()[%[[REM2]]]
----------------
Jianhui-Li wrote:

What are #MAP3 and #MAP4?
I see these at the beginning of the file:
// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0] -> (-s0 + 2)>
// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0] -> (s0 * -2 + 3)>
If so, why not put them close to the test that uses them?
Also, why do you have to use these affine maps? How should I understand the mapping (-s0 + 2)? It is not a very straightforward lowering.



https://github.com/llvm/llvm-project/pull/187392