[Mlir-commits] [mlir] [mlir][affine] add remove-single-iteration-loop pass. (PR #129270)

Tue Oct 21 01:39:08 PDT 2025

linuxlonelyeagle wrote:

A long time has passed since this PR was proposed, and I believe I now have sufficient reasons to bring it up again.
From my observation, the SCF dialect incorporates the promote-single-iter functionality into canonicalize, whereas the Affine dialect places it in a separate pass.

In addition to this, I hope to be able to optimize the following code. Currently, both the Affine and SCF dialects still calculate the trip count through numerical computation. In this code, the fourth loop will actually run only once, but there is currently no suitable pattern to eliminate it, because the current code cannot determine that it executes only once. Even though it is in the SCF dialect, I still think it might be possible to achieve this using ValueBoundsConstraintSet. Can we implement a generic interface that both the Affine and SCF dialects can use?
cc: @krzysz00 @ftynse @joker-eph 😘
```
// Applied through affine.min below, this map yields min(s0 - d0, 16).
#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
module {
  // Five-deep scf.for nest around a linalg.pooling_ncw_max on dynamically
  // shaped tensors. The point of interest is the fourth loop (%arg9): its
  // upper bound is an affine.min result that never exceeds 16 and its step is
  // 16, so it has a single iteration, yet none of its bounds are constants.
  func.func @pooling_ncw_max_tensor(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
    %c16 = arith.constant 16 : index
    %c2 = arith.constant 2 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    // Dynamic extents used as loop bounds and slice sizes.
    %dim = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>
    %dim_0 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>
    %dim_1 = tensor.dim %arg1, %c0 : tensor<?xf32>
    %dim_2 = tensor.dim %arg2, %c2 : tensor<?x?x?xf32>
    // Loop 1: 0 .. dim 0 of %arg0, step 1.
    %0 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg2) -> (tensor<?x?x?xf32>) {
      // Loop 2: 0 .. dim 1 of %arg0, step 16.
      %1 = scf.for %arg5 = %c0 to %dim_0 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x?x?xf32>) {
        // Loop 3: 0 .. dim 2 of %arg2, step 1.
        %2 = scf.for %arg7 = %c0 to %dim_2 step %c1 iter_args(%arg8 = %arg6) -> (tensor<?x?x?xf32>) {
          // %3 = min(%dim_0 - %arg5, 16). Inside loop 2, %arg5 < %dim_0, so
          // %3 lies in [1, 16].
          %3 = affine.min #map(%arg5)[%dim_0]
          %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7] [1, %3, %dim_1] [1, 1, 1] : tensor<?x?x?xf32> to tensor<1x?x?xf32>
          %extracted_slice_3 = tensor.extract_slice %arg1[0] [%dim_1] [1] : tensor<?xf32> to tensor<?xf32>
          %extracted_slice_4 = tensor.extract_slice %arg8[%arg3, %arg5, %arg7] [1, %3, 1] [1, 1, 1] : tensor<?x?x?xf32> to tensor<1x?x1xf32>
          // Initialize the output slice with %cst (0.0) before accumulating.
          %4 = linalg.fill ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x?x1xf32>) -> tensor<1x?x1xf32>
          // Loop 4: 0 .. %3, step 16. Because 1 <= %3 <= 16, only %arg9 = 0
          // is visited, i.e. the loop body runs exactly once. Establishing
          // this requires reasoning about the bound of the affine.min result,
          // not constant lower/upper/step values.
          %5 = scf.for %arg9 = %c0 to %3 step %c16 iter_args(%arg10 = %4) -> (tensor<1x?x1xf32>) {
            // Loop 5: 0 .. dim 0 of %arg1, step 1.
            %6 = scf.for %arg11 = %c0 to %dim_1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x?x1xf32>) {
              // %7 = min(%3 - %arg9, 16).
              %7 = affine.min #map(%arg9)[%3]
              %extracted_slice_5 = tensor.extract_slice %extracted_slice[0, %arg9, %arg11] [1, %7, 1] [1, 1, 1] : tensor<1x?x?xf32> to tensor<1x?x1xf32>
              %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[%arg11] [1] [1] : tensor<?xf32> to tensor<1xf32>
              %extracted_slice_7 = tensor.extract_slice %arg12[0, %arg9, 0] [1, %7, 1] [1, 1, 1] : tensor<1x?x1xf32> to tensor<1x?x1xf32>
              %8 = linalg.pooling_ncw_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x?x1xf32>, tensor<1xf32>) outs(%extracted_slice_7 : tensor<1x?x1xf32>) -> tensor<1x?x1xf32>
              // Write the pooled tile back into the loop-carried accumulator.
              %inserted_slice_8 = tensor.insert_slice %8 into %arg12[0, %arg9, 0] [1, %7, 1] [1, 1, 1] : tensor<1x?x1xf32> into tensor<1x?x1xf32>
              scf.yield %inserted_slice_8 : tensor<1x?x1xf32>
            }
            scf.yield %6 : tensor<1x?x1xf32>
          }
          // Write the result of loops 4/5 back into the full output tensor.
          %inserted_slice = tensor.insert_slice %5 into %arg8[%arg3, %arg5, %arg7] [1, %3, 1] [1, 1, 1] : tensor<1x?x1xf32> into tensor<?x?x?xf32>
          scf.yield %inserted_slice : tensor<?x?x?xf32>
        }
        scf.yield %2 : tensor<?x?x?xf32>
      }
      scf.yield %1 : tensor<?x?x?xf32>
    }
    return %0 : tensor<?x?x?xf32>
  }
  // Transform-dialect schedule: match linalg.pooling_ncw_max ops, apply
  // transform.structured.fuse with tile sizes [1, 16, 1], then
  // transform.structured.tile_using_for with tile sizes [1, 16, 1, 1].
  // NOTE(review): presumably this is the schedule that produced the loop nest
  // above; the listing itself does not state that, so confirm before relying
  // on it.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.pooling_ncw_max"]} in %arg0 : (!transform.any_op) -> !transform.any_op
      %transformed, %loops:3 = transform.structured.fuse %0 tile_sizes [1, 16, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
      %tiled_linalg_op, %loops_0:4 = transform.structured.tile_using_for %transformed tile_sizes [1, 16, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
      transform.yield 
    }
  }
}
```


https://github.com/llvm/llvm-project/pull/129270