[Mlir-commits] [mlir] [mlir] Allow unroll & jam on SCF loops with results (PR #98887)
Oleksandr Alex Zinenko
llvmlistbot at llvm.org
Mon Aug 12 22:47:16 PDT 2024
================
@@ -336,6 +336,119 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-LABEL: @loop_unroll_and_jam_loop_with_results
+func.func @loop_unroll_and_jam_loop_with_results() -> index {
+ // CHECK: %[[C0:.*]] = arith.constant 0
+ // CHECK: %[[UB:.*]] = arith.constant 40
+ // CHECK: %[[STEP:.*]] = arith.constant 8
+ %c0 = arith.constant 0 : index
+ %c40 = arith.constant 40 : index
+ %c2 = arith.constant 2 : index
+ // CHECK: %[[RES:.*]]:4 = scf.for %[[I:.*]] = %[[C0]] to %[[UB]] step %[[STEP]]
+ // CHECK-SAME: iter_args(%[[ARG0:.*]] = %[[C0]], %[[ARG1:.*]] = %[[C0]],
+ // CHECK-SAME: %[[ARG2:.*]] = %[[C0]], %[[ARG3:.*]] = %[[C0]])
+ %sum = scf.for %i = %c0 to %c40 step %c2 iter_args(%does_not_alias_aggregated = %c0) -> (index) {
+ %sum = arith.addi %i, %i : index
+ // CHECK: scf.yield %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index, index
+ scf.yield %sum : index
+ }
+ return %sum : index
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["arith.addi"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {op_name = "scf.for"} : (!transform.any_op) -> !transform.op<"scf.for">
+ transform.loop.unroll_and_jam %1 { factor = 4 } : !transform.op<"scf.for">
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @unroll_jam_tiled_loops
+func.func @unroll_jam_tiled_loops(%A : tensor<8x16x4x8xbf16>, %B : tensor<16x8x8x4xbf16>) -> tensor<16x16x4x4xf32> {
+ // CHECK: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: %[[C8:.*]] = arith.constant 8 : index
+ // CHECK: %[[C16:.*]] = arith.constant 16 : index
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c0_f32 = arith.constant 0.0 : f32
+ %buf = memref.alloc() : memref<16x16x4x4xf32>
+ %ten = bufferization.to_tensor %buf restrict writable : memref<16x16x4x4xf32>
+ // CHECK: %[[AT:.*]] = linalg.fill {{.*}} -> tensor<16x16x4x4xf32>
+ %acc = linalg.fill ins(%c0_f32 : f32) outs(%ten : tensor<16x16x4x4xf32>) -> tensor<16x16x4x4xf32>
+ // CHECK: %[[L0R:.*]]:2 = scf.for %{{.*}} = %[[C0]] to %[[C16]] step %[[C2]]
+ // CHECK-SAME: iter_args(%[[L0IA0:.*]] = %[[AT]], %[[L0IA1:.*]] = %[[AT]])
+ %l0r = scf.for %i = %c0 to %c16 step %c1 iter_args(%acl0l1 = %acc) -> (tensor<16x16x4x4xf32>) {
+ // CHECK: %[[L1R:.*]]:4 = scf.for %{{.*}} = %[[C0]] to %[[C16]] step %[[C2]]
+ // CHECK-SAME: iter_args(%[[L1IA0:.*]] = %[[L0IA0]], %[[L1IA1:.*]] = %[[L0IA0]],
+ // CHECK-SAME: %[[L1IA2:.*]] = %[[L0IA1]], %[[L1IA3:.*]] = %[[L0IA1]])
+ %l1r = scf.for %j = %c0 to %c16 step %c1 iter_args(%acl1l2 = %acl0l1) -> (tensor<16x16x4x4xf32>) {
+ // CHECK: %[[L2R:.*]]:4 = scf.for %{{.*}} = %[[C0]] to %[[C8]] step %[[C1]]
+ // CHECK-SAME: iter_args(%[[L2IA0:.*]] = %[[L1IA0]], %[[L2IA1:.*]] = %[[L1IA1]],
+ // CHECK-SAME: %[[L2IA2:.*]] = %[[L1IA2]], %[[L2IA3:.*]] = %[[L1IA3]])
+ %l2r = scf.for %k = %c0 to %c8 step %c1 iter_args(%C = %acl1l2) -> (tensor<16x16x4x4xf32>) {
+ %ta = tensor.extract_slice %A[%k, %i, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1] : tensor<8x16x4x8xbf16> to tensor<1x1x4x8xbf16>
+ %tb = tensor.extract_slice %B[%j, %k, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : tensor<16x8x8x4xbf16> to tensor<1x1x8x4xbf16>
+ %tc = tensor.extract_slice %C[%j, %i, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<16x16x4x4xf32> to tensor<1x1x4x4xf32>
+ %rr = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>,
+ affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>,
+ affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>],
+ iterator_types = ["parallel", "parallel", "reduction",
+ "parallel", "parallel", "reduction"]}
+ ins(%ta, %tb : tensor<1x1x4x8xbf16>, tensor<1x1x8x4xbf16>)
+ outs(%tc : tensor<1x1x4x4xf32>) {
+ ^bb0(%ia: bf16, %ib: bf16, %out: f32):
+ %0 = arith.extf %ia : bf16 to f32
+ %1 = arith.extf %ib : bf16 to f32
+ %2 = arith.mulf %0, %1 : f32
+ %3 = arith.addf %out, %2 : f32
+ linalg.yield %3 : f32
+ } -> tensor<1x1x4x4xf32>
+ %is = tensor.insert_slice %rr into %C[%j, %i, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<1x1x4x4xf32> into tensor<16x16x4x4xf32>
+ // CHECK: scf.yield %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :
+ // CHECK-SAME: tensor<16x16x4x4xf32>, tensor<16x16x4x4xf32>,
+ // CHECK-SAME: tensor<16x16x4x4xf32>, tensor<16x16x4x4xf32>
+ scf.yield %is : tensor<16x16x4x4xf32>
+ }
+ // CHECK: scf.yield %[[L2R]]#0, %[[L2R]]#1, %[[L2R]]#2, %[[L2R]]#3 :
+ // CHECK-SAME: tensor<16x16x4x4xf32>, tensor<16x16x4x4xf32>,
+ // CHECK-SAME: tensor<16x16x4x4xf32>, tensor<16x16x4x4xf32>
+ scf.yield %l2r : tensor<16x16x4x4xf32>
+ }
+ // CHECK: scf.yield %[[L1R]]#0, %[[L1R]]#2 :
+ // CHECK-SAME: tensor<16x16x4x4xf32>, tensor<16x16x4x4xf32>
+ scf.yield %l1r : tensor<16x16x4x4xf32>
+ }
+ // CHECK: return %[[L0R]]#0 : tensor<16x16x4x4xf32>
+ return %l0r : tensor<16x16x4x4xf32>
+}
----------------
ftynse wrote:
Could we minimize this test? I understand it comes from a real use case, but it looks obnoxiously long and therefore brittle. Using a unary operation (negate or cast) on a 3d tensor would already be an improvement.
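
A reduced test along those lines might look roughly like this (hypothetical sketch, not verified: the function name, the 3-D tensor shape, and factor 2 are illustrative, and the transform sequence simply mirrors the one above):

func.func @unroll_jam_unary_3d(%arg0: tensor<2x4x8xf32>) -> tensor<2x4x8xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %init = tensor.empty() : tensor<2x4x8xf32>
  // Single loop-carried tensor result; unroll & jam should duplicate the iter_arg.
  %res = scf.for %i = %c0 to %c8 step %c1 iter_args(%acc = %arg0) -> (tensor<2x4x8xf32>) {
    // Element-wise negate on the 3-D tensor (the unary op suggested above).
    %neg = linalg.generic {
        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
        iterator_types = ["parallel", "parallel", "parallel"]}
        ins(%acc : tensor<2x4x8xf32>) outs(%init : tensor<2x4x8xf32>) {
      ^bb0(%in: f32, %out: f32):
        %n = arith.negf %in : f32
        linalg.yield %n : f32
    } -> tensor<2x4x8xf32>
    scf.yield %neg : tensor<2x4x8xf32>
  }
  return %res : tensor<2x4x8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1 = transform.get_parent_op %0 {op_name = "scf.for"} : (!transform.any_op) -> !transform.op<"scf.for">
    transform.loop.unroll_and_jam %1 { factor = 2 } : !transform.op<"scf.for">
    transform.yield
  }
}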
https://github.com/llvm/llvm-project/pull/98887