[Mlir-commits] [mlir] [mlir][vector] Add support for `vector.multi_reduction` and `vector.shape_cast` distribution. (PR #154438)
Adam Siemieniuk
llvmlistbot at llvm.org
Wed Sep 3 01:58:33 PDT 2025
================
@@ -850,6 +850,85 @@ func.func @vector_reduction_acc(%laneid: index) -> (f32) {
return %r : f32
}
+// -----
+// Column reduction: vector.multi_reduction over dim 0 of a vector<32x64xf32>
+// source with a vector<64xf32> accumulator, inside a warp op of size 32 whose
+// result type is vector<2xf32>. The expected output yields the source and the
+// accumulator from the warp op (as vector<32x2xf32> and vector<2xf32>) and
+// reduces each of the two columns with a separate vector.reduction.
+// NOTE: a FileCheck directive is only recognized when the prefix is followed
+// immediately by a colon; a space before the colon turns it into a plain
+// comment and the check is silently skipped.
+// CHECK-PROP-LABEL: func.func @vector_multi_reduction_col_reduce
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0({{.*}})[32] -> (vector<32x2xf32>, vector<2xf32>) {
+// CHECK-PROP: %[[SOURCE:.*]] = "some_def"() : () -> vector<32x64xf32>
+// CHECK-PROP: %[[ACC:.*]] = "some_def"() : () -> vector<64xf32>
+// CHECK-PROP: gpu.yield %[[SOURCE]], %[[ACC]] : vector<32x64xf32>, vector<64xf32>
+// CHECK-PROP: }
+// CHECK-PROP: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#0
+// CHECK-PROP-SAME: {offsets = [0, 0], sizes = [32, 1], strides = [1, 1]} : vector<32x2xf32> to vector<32x1xf32>
+// CHECK-PROP: %[[COL0CAST:.*]] = vector.shape_cast %[[COL0]] : vector<32x1xf32> to vector<32xf32>
+// CHECK-PROP: %[[ACC0:.*]] = vector.extract %[[W]]#1[0] : f32 from vector<2xf32>
+// CHECK-PROP: %[[REDUCE0:.*]] = vector.reduction <add>, %[[COL0CAST]], %[[ACC0]] : vector<32xf32> into f32
+// CHECK-PROP: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#0
+// CHECK-PROP-SAME: {offsets = [0, 1], sizes = [32, 1], strides = [1, 1]} : vector<32x2xf32> to vector<32x1xf32>
+// CHECK-PROP: %[[COL1CAST:.*]] = vector.shape_cast %[[COL1]] : vector<32x1xf32> to vector<32xf32>
+// CHECK-PROP: %[[ACC1:.*]] = vector.extract %[[W]]#1[1] : f32 from vector<2xf32>
+// CHECK-PROP: %[[REDUCE1:.*]] = vector.reduction <add>, %[[COL1CAST]], %[[ACC1]] : vector<32xf32> into f32
+// CHECK-PROP: %[[R:.*]] = vector.from_elements %[[REDUCE0]], %[[REDUCE1]] : vector<2xf32>
+// CHECK-PROP: return %[[R]] : vector<2xf32>
+func.func @vector_multi_reduction_col_reduce(%laneid: index) -> vector<2xf32> {
+  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
+    %0 = "some_def"() : () -> (vector<32x64xf32>)
+    %acc = "some_def"() : () -> (vector<64xf32>)
+    %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<32x64xf32> to vector<64xf32>
+    gpu.yield %1 : vector<64xf32>
+  }
+  return %r : vector<2xf32>
+}
+
+// -----
+// CHECK-PROP-LABEL: func.func @vector_multi_reduction_row_reduce
+// CHECK-PROP-DAG: %[[C16:.*]] = arith.constant 16 : i32
+// CHECK-PROP-DAG: %[[C8:.*]] = arith.constant 8 : i32
+// CHECK-PROP-DAG: %[[C4:.*]] = arith.constant 4 : i32
+// CHECK-PROP-DAG: %[[C2:.*]] = arith.constant 2 : i32
+// CHECK-PROP-DAG: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK-PROP-DAG: %[[C32:.*]] = arith.constant 32 : i32
+// CHECK-PROP-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2x1xf32>) {
+// CHECK-PROP: %[[SRC:.*]] = "some_def"() : () -> vector<2x32xf32>
+// CHECK-PROP: gpu.yield %[[SRC]] : vector<2x32xf32>
+// CHECK-PROP: }
+// CHECK-PROP: %[[T1:.*]] = vector.extract %[[W]][0, 0] : f32 from vector<2x1xf32>
+// CHECK-PROP: %[[SR:.*]], %{{.*}} = gpu.shuffle xor %[[T1]], %[[C1]], %[[C32]] : f32
+// CHECK-PROP: %[[T2:.*]] = arith.addf %[[T1]], %[[SR]] : f32
+// CHECK-PROP: %[[SR0:.*]], %{{.*}} = gpu.shuffle xor %[[T2]], %[[C2]], %[[C32]] : f32
+// CHECK-PROP: %[[T3:.*]] = arith.addf %[[T2]], %[[SR0]] : f32
+// CHECK-PROP: %[[SR2:.*]], %{{.*}} = gpu.shuffle xor %[[T3]], %[[C4]], %[[C32]] : f32
+// CHECK-PROP: %[[T4:.*]] = arith.addf %[[T3]], %[[SR2]] : f32
+// CHECK-PROP: %[[SR4:.*]], %{{.*}} = gpu.shuffle xor %[[T4]], %[[C8]], %[[C32]] : f32
+// CHECK-PROP: %[[T5:.*]] = arith.addf %[[T4]], %[[SR4]] : f32
+// CHECK-PROP: %[[SR6:.*]], %{{.*}} = gpu.shuffle xor %[[T5]], %[[C16]], %[[C32]] : f32
+// CHECK-PROP: %[[T6:.*]] = arith.addf %[[T5]], %[[SR6]] : f32
+// CHECK-PROP: %[[R0:.*]] = arith.addf %[[T6]], %[[CST]] : f32
+//
+// CHECK-PROP: %[[T8:.*]] = vector.extract %[[W]][1, 0] : f32 from vector<2x1xf32>
+// CHECK-PROP: %[[SR8:.*]], %{{.*}} = gpu.shuffle xor %[[T8]], %[[C1]], %[[C32]] : f32
+// CHECK-PROP: %[[T9:.*]] = arith.addf %[[T8]], %[[SR8]] : f32
+// CHECK-PROP: %[[SR10:.*]], %{{.*}} = gpu.shuffle xor %[[T9]], %[[C2]], %[[C32]] : f32
+// CHECK-PROP: %[[T10:.*]] = arith.addf %[[T9]], %[[SR10]] : f32
+// CHECK-PROP: %[[SR12:.*]], %{{.*}} = gpu.shuffle xor %[[T10]], %[[C4]], %[[C32]] : f32
+// CHECK-PROP: %[[T11:.*]] = arith.addf %[[T10]], %[[SR12]] : f32
+// CHECK-PROP: %[[SR14:.*]], %{{.*}} = gpu.shuffle xor %[[T11]], %[[C8]], %[[C32]] : f32
+// CHECK-PROP: %[[T12:.*]] = arith.addf %[[T11]], %[[SR14]] : f32
+// CHECK-PROP: %[[SR16:.*]], %{{.*}} = gpu.shuffle xor %[[T12]], %[[C16]], %[[C32]] : f32
+// CHECK-PROP: %[[T13:.*]] = arith.addf %[[T12]], %[[SR16]] : f32
+// CHECK-PROP: %[[R1:.*]] = arith.addf %[[T13]], %[[CST]] : f32
+// CHECK-PROP: %[[R:.*]] = vector.from_elements %[[R0]], %[[R1]] : vector<2xf32>
+// CHECK-PROP: return %[[R]] : vector<2xf32>
+func.func @vector_multi_reduction_row_reduce(%laneid: index) -> vector<2xf32> {
+ %zero = arith.constant dense<0.0> : vector<2xf32>
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<2x32xf32>)
----------------
adam-smnk wrote:
Oh I see, I missed the exact flow. Makes perfect sense 👍
https://github.com/llvm/llvm-project/pull/154438
More information about the Mlir-commits
mailing list