[Mlir-commits] [mlir] [mlir][SCF] Modernize `coalesceLoops` method to handle `scf.for` loops with iter_args (PR #87019)

Thu Mar 28 18:25:41 PDT 2024

================
@@ -96,3 +101,150 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @tensor_loops(%arg0 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+    %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> tensor<?x?xf32> {
+  %0 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg1 = %arg0) -> tensor<?x?xf32> {
+    %1 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg2 = %arg1) -> tensor<?x?xf32> {
+      %2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg3 = %arg2) -> tensor<?x?xf32> {
+        %3 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>)
+        scf.yield %3 : tensor<?x?xf32>
+      }
+      scf.yield %2 : tensor<?x?xf32>
+    }
+    scf.yield %1 : tensor<?x?xf32>
+  } {coalesce}
+  return %0 : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+    %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+    transform.yield
+  }
+}
+//  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1, s2] -> ((-s0 + s1) ceildiv s2)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> ((((-s0 + s1) ceildiv s2) * ((-s3 + s4) ceildiv s5)) * ((-s6 + s7) ceildiv s8))>
+//  CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0, s1, s2] -> (s0 * s1 + s2)>
+//      CHECK: func.func @tensor_loops(
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
+// CHECK-SAME:     %[[LB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[UB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[STEP0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[LB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[UB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[STEP1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[LB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[UB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[STEP2:[a-zA-Z0-9_]+]]: index
+//  CHECK-DAG:   %[[NEWUB0:.+]] = affine.apply #[[MAP]]()[%[[LB0]], %[[UB0]], %[[STEP0]]]
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0
+//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1
+//  CHECK-DAG:   %[[NEWUB1:.+]] = affine.apply #[[MAP]]()[%[[LB1]], %[[UB1]], %[[STEP1]]]
+//  CHECK-DAG:   %[[NEWUB2:.+]] = affine.apply #[[MAP]]()[%[[LB2]], %[[UB2]], %[[STEP2]]]
+//  CHECK-DAG:   %[[NEWUB:.+]] = affine.apply #[[MAP1]]()[%[[LB0]], %[[UB0]], %[[STEP0]], %[[LB1]], %[[UB1]], %[[STEP1]], %[[LB2]], %[[UB2]], %[[STEP2]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]])
+//      CHECK:     %[[DELINEARIZE:.+]]:3 = affine.delinearize_index %[[IV]] into (%[[NEWUB0]], %[[NEWUB1]], %[[NEWUB2]])
+//      CHECK:     %[[K:.+]] = affine.apply #[[MAP3]]()[%[[DELINEARIZE]]#2, %[[STEP2]], %[[LB2]]]
+//      CHECK:     %[[J:.+]] = affine.apply #[[MAP3]]()[%[[DELINEARIZE]]#1, %[[STEP1]], %[[LB1]]]
+//      CHECK:     %[[I:.+]] = affine.apply #[[MAP3]]()[%[[DELINEARIZE]]#0, %[[STEP0]], %[[LB0]]]
+//      CHECK:     %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]])
+//      CHECK:     scf.yield %[[USE]]
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+// Coalesce only the first two loops, but not the last, since the iter_args don't line up.
+func.func @tensor_loops(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+    %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+  %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+    %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+      %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg5, %arg7 = %arg4) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+        %3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
+        scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
+      }
+      scf.yield %2#0, %2#1 : tensor<?x?xf32>, tensor<?x?xf32>
+    }
+    scf.yield %1#0, %1#1 : tensor<?x?xf32>, tensor<?x?xf32>
+  } {coalesce}
+  return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+    %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+    transform.yield
+  }
+}
----------------
qedawkins wrote:

Ditto: can you add a CHECK-LABEL here and below? It helps with debugging confusing CHECK-NOT-related lit test failures (and you can drop the checks for `transform.named_sequence` as well, since I assume that is why those checks are there).

https://github.com/llvm/llvm-project/pull/87019