[Mlir-commits] [mlir] cd6e02e - [mlir][Linalg] Retire TestLinalgCodegenStrategy pass.

Nicolas Vasilache llvmlistbot at llvm.org
Wed Jul 13 04:20:49 PDT 2022


Author: Nicolas Vasilache
Date: 2022-07-13T04:20:42-07:00
New Revision: cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9

URL: https://github.com/llvm/llvm-project/commit/cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9
DIFF: https://github.com/llvm/llvm-project/commit/cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9.diff

LOG: [mlir][Linalg] Retire TestLinalgCodegenStrategy pass.

This pass tests patterns that are already tested elsewhere by applying them in a semi-targeted
fashion using anchor function and op names.

From now on, targeted tests should use the transform dialect interpreter.

Differential Revision: https://reviews.llvm.org/D129627

Added: 
    

Modified: 
    mlir/test/lib/Dialect/Linalg/CMakeLists.txt
    mlir/tools/mlir-opt/mlir-opt.cpp

Removed: 
    mlir/test/Dialect/Linalg/codegen-strategy.mlir
    mlir/test/Dialect/Linalg/decompose-convolution.mlir
    mlir/test/Dialect/Linalg/hoist-padding.mlir
    mlir/test/Dialect/Linalg/interchange.mlir
    mlir/test/Dialect/Linalg/pad.mlir
    mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
    mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
    mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
    mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
    mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp


################################################################################
diff  --git a/mlir/test/Dialect/Linalg/codegen-strategy.mlir b/mlir/test/Dialect/Linalg/codegen-strategy.mlir
deleted file mode 100644
index 05f99635e85c6..0000000000000
--- a/mlir/test/Dialect/Linalg/codegen-strategy.mlir
+++ /dev/null
@@ -1,92 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=2,4,8 vectorize vectorize-contraction-to=matrixintrinsics unroll-vector-transfers=true" -split-input-file | FileCheck %s --check-prefix=CHECK-INTRINSIC
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 promote promote-full-tile-pad register-tile-sizes=2,4,8 vectorize vectorize-contraction-to=outerproduct split-transfers=true unroll-vector-transfers=false" -split-input-file | FileCheck %s --check-prefix=CHECK-OUTER
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 tile-interchange=1,2,0 generalize iterator-interchange=0,2,1" -split-input-file | FileCheck %s --check-prefix=CHECK-INTERCHANGE
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=1,1,0 hoist-paddings=3,3,0" -split-input-file | FileCheck %s --check-prefix=CHECK-PAD
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 vectorize" -split-input-file | FileCheck %s --check-prefix=CHECK-FUSE
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=conv anchor-op=linalg.conv_2d_nhwc_hwcf tile-sizes=1,1,8,32,1,1,8 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 decompose vectorize vectorize-padding" -split-input-file | FileCheck %s --check-prefix=CHECK-DECOMP
-
-// CHECK-INTRINSIC: func @matmul(
-//     CHECK-OUTER: func @matmul(
-func.func @matmul(%arg0: memref<72x72xf32>, %arg1: memref<72x72xf32>, %arg2: memref<72x72xf32>) {
-
-  // Check the matrix intrinsic lowering is triggered.
-  //      CHECK-INTRINSIC: vector.matrix_multiply
-  // CHECK-INTRINSIC-SAME: {lhs_columns = 8 : i32, lhs_rows = 2 : i32, rhs_columns = 4 : i32}
-  // CHECK-INTRINSIC-SAME: (vector<16xf32>, vector<32xf32>) -> vector<8xf32>
-
-  // Check the outer product lowering is triggered.
-  //          CHECK-OUTER: vector.outerproduct {{.*}} : vector<2xf32>, vector<4xf32>
-  linalg.matmul ins(%arg0, %arg1: memref<72x72xf32>, memref<72x72xf32>) outs(%arg2: memref<72x72xf32>)
-  func.return
-}
-
-// -----
-
-// CHECK-INTERCHANGE: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
-  //  CHECK-INTERCHANGE-DAG: %[[C16:.*]] = arith.constant 16
-  //  CHECK-INTERCHANGE-DAG: %[[C32:.*]] = arith.constant 32
-  //  CHECK-INTERCHANGE-DAG: %[[C64:.*]] = arith.constant 64
-
-  // Check the tile loops are interchanged.
-  //      CHECK-INTERCHANGE: scf.for {{.*}} step %[[C32]]
-  //      CHECK-INTERCHANGE:   scf.for {{.*}} step %[[C64]]
-  //      CHECK-INTERCHANGE:    scf.for {{.*}} step %[[C16]]
-
-  // Check the operation has been generalized and interchanged.
-  //      CHECK-INTERCHANGE:      linalg.generic
-  // CHECK-INTERCHANGE-SAME:      iterator_types = ["parallel", "reduction", "parallel"]
-  %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32>
-  func.return %0 : tensor<72x72xf32>
-}
-
-// -----
-
-//     CHECK-PAD-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 72, 16)>
-
-//         CHECK-PAD: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
-
-  // Check the padding of the input operands has been hoisted out of the tile loop nest.
-  //      CHECK-PAD-COUNT=2: tensor.pad %{{.*}} nofold
-  //              CHECK-PAD: scf.for
-  // Check CSE eliminates the duplicate min operations introduced by tiling.
-  //              CHECK-PAD: affine.min #[[MAP0]]
-  //          CHECK-PAD-NOT: affine.min #[[MAP0]]
-  //      CHECK-PAD-COUNT=2: scf.for
-  //              CHECK-PAD: linalg.matmul
-  %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32>
-  func.return %0 : tensor<72x72xf32>
-}
-
-// -----
-
-//         CHECK-FUSE: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
-
-  // Check the padding and vectorization applies to the fill operation due to the empty anchor op string.
-  //        CHECK-FUSE:  %[[CST:.*]] = arith.constant dense<0.000000e+00>
-  //        CHECK-FUSE:  vector.transfer_write %[[CST]]
-  %cst = arith.constant 0.0 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<72x72xf32>) -> tensor<72x72xf32>
-
-  // Check the matmul is padded and vectorized despite the empty anchor op string.
-  //        CHECK-FUSE:  vector.outerproduct
-  %1 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%0: tensor<72x72xf32>) -> tensor<72x72xf32>
-  func.return %1 : tensor<72x72xf32>
-}
-
-// -----
-
-//         CHECK-DECOMP: func @conv(
-func.func @conv(%arg0: tensor<8x18x17x32xf32>, %arg1: tensor<3x3x32x64xf32>, %arg2: tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32>
-
-  // Check the conv is padded by a rank-reducing vector transfer op pair.
-  //        CHECK-DECOMP:  vector.transfer_read {{.*}}: tensor<1x1x?x8xf32>, vector<1x8x8xf32>
-  //        CHECK-DECOMP:  vector.outerproduct
-  //        CHECK-DECOMP:  vector.transfer_write {{.*}}: vector<1x8x32xf32>, tensor<1x1x?x32xf32>
-  %1 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x17x32xf32>, tensor<3x3x32x64xf32>) outs(%0 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32>
-  func.return %1 : tensor<8x16x15x64xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/decompose-convolution.mlir b/mlir/test/Dialect/Linalg/decompose-convolution.mlir
deleted file mode 100644
index ad900a568c709..0000000000000
--- a/mlir/test/Dialect/Linalg/decompose-convolution.mlir
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: mlir-opt -test-linalg-codegen-strategy="decompose" -split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor
-//  CHECK-SAME: (%[[INPUT:.+]]: tensor<4x1x6x3xf32>, %[[FILTER:.+]]: tensor<1x2x3x8xf32>, %[[INIT:.+]]: tensor<4x1x2x8xf32>)
-func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x1x6x3xf32>, %filter: tensor<1x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> {
-  %0 = linalg.conv_2d_nhwc_hwcf
-    {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>}
-    ins(%input, %filter : tensor<4x1x6x3xf32>, tensor<1x2x3x8xf32>)
-    outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32>
-  return %0 : tensor<4x1x2x8xf32>
-}
-
-//               CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]]
-// CHECK-SAME{LITERAL}:   [0, 0, 0, 0] [4, 1, 6, 3] [1, 1, 1, 1] : tensor<4x1x6x3xf32> to tensor<4x6x3xf32>
-//               CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]]
-// CHECK-SAME{LITERAL}:   [0, 0, 0, 0] [1, 2, 3, 8] [1, 1, 1, 1] : tensor<1x2x3x8xf32> to tensor<2x3x8xf32>
-//               CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]]
-// CHECK-SAME{LITERAL}:   [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x1x2x8xf32> to tensor<4x2x8xf32>
-//               CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf
-//          CHECK-SAME:     dilations = dense<3> : vector<1xi64>
-//          CHECK-SAME:     strides = dense<2> : vector<1xi64>
-//          CHECK-SAME:   ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor<4x6x3xf32>, tensor<2x3x8xf32>)
-//          CHECK-SAME:   outs(%[[INIT_1D]] : tensor<4x2x8xf32>)
-//               CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]]
-// CHECK-SAME{LITERAL}:   [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x2x8xf32> into tensor<4x1x2x8xf32>
-//               CHECK: return %[[CONV_2D]]
-// -----
-
-// CHECK-LABEL: func @conv2d_nhwc_qxqx1xq_tensor
-//  CHECK-SAME: (%[[INPUT:.+]]: tensor<?x?x1x?xf32>, %[[FILTER:.+]]: tensor<?x1x?x?xf32>, %[[INIT:.+]]: tensor<?x?x1x?xf32>)
-func.func @conv2d_nhwc_qxqx1xq_tensor(%input: tensor<?x?x1x?xf32>, %filter: tensor<?x1x?x?xf32>, %init: tensor<?x?x1x?xf32>) -> tensor<?x?x1x?xf32> {
-  %0 = linalg.conv_2d_nhwc_hwcf
-    {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>}
-    ins(%input, %filter : tensor<?x?x1x?xf32>, tensor<?x1x?x?xf32>)
-    outs(%init : tensor<?x?x1x?xf32>) -> tensor<?x?x1x?xf32>
-  return %0 : tensor<?x?x1x?xf32>
-}
-
-//               CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]]
-//          CHECK-SAME:   [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-//          CHECK-SAME:     tensor<?x?x1x?xf32> to tensor<?x?x?xf32>
-//               CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]]
-//          CHECK-SAME:   [0, 0, 0, 0] [%{{.*}}, 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] :
-//          CHECK-SAME:     tensor<?x1x?x?xf32> to tensor<?x?x?xf32>
-//               CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]]
-//          CHECK-SAME:   [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-//          CHECK-SAME:     tensor<?x?x1x?xf32> to tensor<?x?x?xf32>
-//               CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf
-//          CHECK-SAME:     dilations = dense<2> : vector<1xi64>
-//          CHECK-SAME:     strides = dense<3> : vector<1xi64>
-//          CHECK-SAME:   ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
-//          CHECK-SAME:   outs(%[[INIT_1D]] : tensor<?x?x?xf32>)
-//               CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]]
-//          CHECK-SAME:   [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-//          CHECK-SAME:     tensor<?x?x?xf32> into tensor<?x?x1x?xf32>
-//               CHECK: return %[[CONV_2D]]
-
-// -----
-
-// Do not convert convolution ops whose window dimensions are not ones.
-
-// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor
-func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x3x5x3xf32>, %filter: tensor<2x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> {
-  // CHECK: linalg.conv_2d_nhwc_hwcf
-  %0 = linalg.conv_2d_nhwc_hwcf
-    {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
-    ins(%input, %filter : tensor<4x3x5x3xf32>, tensor<2x2x3x8xf32>)
-    outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32>
-  return %0 : tensor<4x1x2x8xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor
-func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x1x113x96xf32>, %filter: tensor<1x3x96xf32>, %out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32> {
-  //     CHECK: linalg.depthwise_conv_1d_nwc_wc
-  %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
-         ins(%input, %filter: tensor<1x1x113x96xf32>, tensor<1x3x96xf32>)
-         outs(%out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32>
-  return %0: tensor<1x1x56x96xf32>
-}
-
-// -----
-
-// Do not convert convolution ops whose window dimensions are not ones.
-
-// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor
-func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>, %out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32> {
-  //     CHECK: linalg.depthwise_conv_2d_nhwc_hwc
-  %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
-         ins(%input, %filter: tensor<1x113x113x96xf32>, tensor<3x3x96xf32>)
-         outs(%out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
-  return %0: tensor<1x56x56x96xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
deleted file mode 100644
index 5ac26232d9c0e..0000000000000
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ /dev/null
@@ -1,480 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATVEC
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 transpose-paddings=[1,0],[0],[0] run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=TRANSP
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad hoist-paddings=1,2,1 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
-
-//  MATVEC-DAG: #[[DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-
-//      MATVEC:  static_size_divisible
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @static_size_divisible(%arg0: tensor<24x12xf32>,
-                            %arg1: tensor<12xf32>,
-                            %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c4 = arith.constant 4 : index
-
-  // Pack the vector tiles for all values of IV (IVx4).
-  //      MATVEC:  = linalg.init_tensor [3, 4]
-  //      MATVEC:  %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
-  //        MATVEC:   %[[PIDX0:.*]] = affine.apply #[[DIV4]](%[[PIV0]])
-  //        MATVEC:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [4]
-  //        MATVEC:   %[[T2:.*]] = tensor.pad %[[T1]]
-  //        MATVEC:   %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Index the packed vector.
-    //  MATVEC-DAG:   %[[IDX0:.*]] = affine.apply #[[DIV4]](%[[IV0]])
-    //  MATVEC-DAG:   %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
-    %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = tensor.pad %2 nofold low[%c0] high[%c0]  {
-    ^bb0(%arg5: index):
-      tensor.yield %cst : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the packed input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T4]]
-    %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %4 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 12, 5)>
-// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 5)>
-// MATVEC-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)>
-#map0 = affine_map<(d0) -> (5, -d0 + 12)>
-#map1 = affine_map<(d0) -> (-d0 + 5)>
-
-//      MATVEC:  static_size_not_divisible
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @static_size_not_divisible(%arg0: tensor<24x12xf32>,
-                                %arg1: tensor<12xf32>,
-                                %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c5 = arith.constant 5 : index
-
-  // Pack the vector tiles for all values of IV (IVx5).
-  //      MATVEC:  = linalg.init_tensor [3, 5]
-  //      MATVEC:  %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
-  //        MATVEC:   %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]])
-  //        MATVEC:   %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])
-  //        MATVEC:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]]
-  //        MATVEC:   %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]])
-  //        MATVEC:   %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]]
-  //        MATVEC:   %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %1 = affine.min #map0(%arg3)
-    %2 = tensor.extract_slice %arg0[0, %arg3] [24, %1] [1, 1] : tensor<24x12xf32> to tensor<24x?xf32>
-
-    // Index the packed vector.
-    //  MATVEC-DAG:   %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV0]])
-    //  MATVEC-DAG:   %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
-    %3 = tensor.extract_slice %arg1[%arg3] [%1] [1] : tensor<12xf32> to tensor<?xf32>
-    %4 = affine.apply #map1(%1)
-    %5 = tensor.pad %2 low[%c0, %c0] high[%c0, %4]  {
-    ^bb0(%arg5: index, %arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<24x?xf32> to tensor<24x5xf32>
-    %6 = tensor.pad %3 low[%c0] high[%4]  {
-    ^bb0(%arg5: index):
-      tensor.yield %cst : f32
-    } : tensor<?xf32> to tensor<5xf32>
-
-    // Check matvec uses the packed input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T4]]
-    %7 = linalg.matvec ins(%5, %6 : tensor<24x5xf32>, tensor<5xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %7 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC-DAG: #[[SDIV4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-// MATVEC-DAG: #[[DDIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
-// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 4)>
-#map0 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
-#map1 = affine_map<(d0) -> (-d0 + 4)>
-
-//      MATVEC:  dynamic_size
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<?xf32>
-func.func @dynamic_size(%arg0: tensor<24x?xf32>,
-                   %arg1: tensor<?xf32>,
-                   %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-
-  //      MATVEC:  %[[D0:.*]] = tensor.dim
-  %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32>
-
-  // Pack the vector tiles for all values of IV (IVx4).
-  //      MATVEC:  %[[PS0:.*]] = affine.apply #[[SDIV4]]()[%[[D0]]]
-  //      MATVEC:  = linalg.init_tensor [%[[PS0]], 4]
-  //      MATVEC:  %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
-  //        MATVEC:   %[[PIDX0:.*]] = affine.apply #[[DDIV4]](%[[PIV0]])
-  //        MATVEC:   %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])[%[[D0]]]
-  //        MATVEC:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]]
-  //        MATVEC:   %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]])
-  //        MATVEC:   %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]]
-  //        MATVEC:   %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %2 = affine.min #map0(%arg3)[%0]
-    %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32>
-
-    // Index the packed vector.
-    //  MATVEC-DAG:   %[[IDX0:.*]] = affine.apply #[[DDIV4]](%[[IV0]])
-    //  MATVEC-DAG:   %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
-    %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor<?xf32> to tensor<?xf32>
-    %5 = affine.apply #map1(%2)
-    %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5]  {
-    ^bb0(%arg5: index, %arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<24x?xf32> to tensor<24x4xf32>
-    %7 = tensor.pad %4 nofold low[%c0] high[%5]  {
-    ^bb0(%arg5: index):
-      tensor.yield %cst : f32
-    } : tensor<?xf32> to tensor<4xf32>
-
-    // Check matvec uses the packed input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T4]]
-    %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %8 : tensor<24xf32>
-  }
-  return %1 : tensor<24xf32>
-}
-
-// -----
-
-//      MATVEC:  non_constant_padding
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @non_constant_padding(%arg0: tensor<24x12xf32>,
-                   %arg1: tensor<12xf32>,
-                   %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %c4 = arith.constant 4 : index
-  %c12 = arith.constant 12 : index
-  %c0 = arith.constant 0 : index
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Check the non constant padding is not hoisted.
-    //      MATVEC:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[T1:.*]] = tensor.pad %[[T0]]
-    %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = tensor.pad %2 nofold low[%c0] high[%c0]  {
-    ^bb0(%arg5: index):
-      %5 = arith.index_cast %arg3 : index to i32
-      %6 = arith.sitofp %5 : i32 to f32
-      tensor.yield %6 : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the padded input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T1]]
-    %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %4 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-//      MATVEC:  non_constant_op_padding
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @non_constant_op_padding(%arg0: tensor<24x12xf32>,
-                      %arg1: tensor<12xf32>,
-                      %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c4 = arith.constant 4 : index
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Check the non constant op padding is not hoisted.
-    //      MATVEC:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[V0:.*]] = tensor.extract %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[T1:.*]] = tensor.pad %[[T0]]
-    //        MATVEC:  tensor.yield %[[V0]]
-    %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = tensor.extract %arg1[%arg3] : tensor<12xf32>
-    %4 = tensor.pad %2 nofold low[%c0] high[%c0]  {
-    ^bb0(%arg5: index):
-      tensor.yield %3 : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the padded input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T1]]
-    %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %5 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-//      MATVEC:  non_index_operand
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: i32
-func.func @non_index_operand(%arg0: tensor<24x12xf32>,
-                        %arg1: tensor<12xf32>,
-                        %arg2: tensor<24xf32>,
-                        %arg3: i32) -> tensor<24xf32> {
-  %c4 = arith.constant 4 : index
-  %c12 = arith.constant 12 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Check the index_cast prevents hoisting due to its non index operand.
-    //      MATVEC:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[IDX0:.*]] = arith.index_cast %[[ARG3]]
-    //      MATVEC:  %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
-    %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = arith.index_cast %arg3 : i32 to index
-    %4 = tensor.pad %2 nofold low[%3] high[%3]  {
-    ^bb0(%arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the padded input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T1]]
-    %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %5 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-//      MATVEC:  memory_effect
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: memref<?xindex>
-func.func @memory_effect(%arg0: tensor<24x12xf32>,
-                    %arg1: tensor<12xf32>,
-                    %arg2: tensor<24xf32>,
-                    %arg3: memref<?xindex>) -> tensor<24xf32> {
-  %c4 = arith.constant 4 : index
-  %c12 = arith.constant 12 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Check the load prevents hoisting due to its memory effect.
-    //      MATVEC:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[IDX0:.*]] = memref.load %[[ARG3]]
-    //      MATVEC:  %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
-    %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = memref.load %arg3[%c0] : memref<?xindex>
-    %4 = tensor.pad %2 nofold low[%3] high[%3]  {
-    ^bb0(%arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the padded input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T1]]
-    %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %5 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-//      MATVEC:  index_result_loop
-// MATVEC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: index
-func.func @index_result_loop(%arg0: tensor<24x12xf32>,
-                        %arg1: tensor<12xf32>,
-                        %arg2: tensor<24xf32>,
-                        %arg3: index) -> tensor<24xf32> {
-  %c4 = arith.constant 4 : index
-  %c12 = arith.constant 12 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-
-  //      MATVEC:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
-    %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
-    // Check the unexpected operation with a region prevents hoisting.
-    //      MATVEC:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
-    //      MATVEC:  %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]]
-    //      MATVEC:  %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
-    %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
-    %3 = scf.for %arg6 = %c0 to %c12 step %arg3 iter_args(%arg7 = %c0) -> (index) {
-      %6 = arith.addi %arg3, %arg7 : index
-      scf.yield %6 : index
-    }
-    %4 = tensor.pad %2 nofold low[%3] high[%3]  {
-    ^bb0(%arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<4xf32> to tensor<4xf32>
-
-    // Check matvec uses the padded input vector.
-    //      MATVEC:  = linalg.matvec ins(%{{.*}}, %[[T1]]
-    %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %5 : tensor<24xf32>
-  }
-  return %0 : tensor<24xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0) -> (-d0 + 12, 5)>
-#map1 = affine_map<(d0) -> (-d0 + 5)>
-
-//      MATMUL:  tile_and_fuse
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<12x6xf32>
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<6x24xf32>
-func.func @tile_and_fuse(%arg0: tensor<12x6xf32>,
-                    %arg1: tensor<6x24xf32>,
-                    %arg2: tensor<12x24xf32>) -> tensor<12x24xf32> {
-  %c6 = arith.constant 6 : index
-  %c3 = arith.constant 3 : index
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c5 = arith.constant 5 : index
-  %cst = arith.constant 0.000000e+00 : f32
-
-  // Check the second input operand is hoisted by two loop nests.
-  //      MATMUL:  %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
-  //        MATMUL:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  //        MATMUL:   %[[T2:.*]] = tensor.pad %[[T1]]
-
-  //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<12x24xf32>) {
-    %1 = affine.min #map0(%arg3)
-
-    // Check the extract_slice op introduced by the double tiling does not prevent the hoisting.
-    %2 = tensor.extract_slice %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor<12x24xf32> to tensor<?x24xf32>
-    %3 = affine.apply #map1(%1)
-
-    // Check the fused and padded fill op does not prevent hoisting.
-    %4 = tensor.pad %2 nofold low[%c0, %c0] high[%3, %c0]  {
-    ^bb0(%arg5: index, %arg6: index):
-      tensor.yield %cst : f32
-    } : tensor<?x24xf32> to tensor<5x24xf32>
-    %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<5x24xf32>) -> tensor<5x24xf32>
-    %6 = tensor.extract_slice %5[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor<?x24xf32>
-
-    // Check the first input operand is hoisted by one loop nest.
-    //      MATMUL:  %[[T3:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
-    //        MATMUL:   %[[T4:.*]] = tensor.extract_slice %[[ARG0]]
-    //        MATMUL:   %[[T5:.*]] = tensor.pad %[[T4]]
-
-    //      MATMUL:  scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %7 = scf.for %arg5 = %c0 to %c6 step %c3 iter_args(%arg6 = %6) -> (tensor<?x24xf32>) {
-
-      // Index the packed operands.
-      //    MATMUL-DAG:   %[[T6:.*]] = tensor.extract_slice %[[T3]]
-      //    MATMUL-DAG:   %[[T7:.*]] = tensor.extract_slice %[[T0]]
-      %9 = tensor.extract_slice %arg0[%arg3, %arg5] [%1, 3] [1, 1] : tensor<12x6xf32> to tensor<?x3xf32>
-      %10 = tensor.extract_slice %arg1[%arg5, 0] [3, 24] [1, 1] : tensor<6x24xf32> to tensor<3x24xf32>
-      %11 = tensor.extract_slice %arg6[0, 0] [%1, 24] [1, 1] : tensor<?x24xf32> to tensor<?x24xf32>
-      %12 = tensor.pad %9 nofold low[%c0, %c0] high[%3, %c0]  {
-      ^bb0(%arg7: index, %arg8: index):
-        tensor.yield %cst : f32
-      } : tensor<?x3xf32> to tensor<5x3xf32>
-      %13 = tensor.pad %10 nofold low[%c0, %c0] high[%c0, %c0]  {
-      ^bb0(%arg7: index, %arg8: index):
-        tensor.yield %cst : f32
-      } : tensor<3x24xf32> to tensor<3x24xf32>
-
-      // Check the output padding is not hoisted.
-      //      MATMUL:   %[[T8:.*]] = tensor.pad
-      %14 = tensor.pad %11 nofold low[%c0, %c0] high[%3, %c0]  {
-      ^bb0(%arg7: index, %arg8: index):
-        tensor.yield %cst : f32
-      } : tensor<?x24xf32> to tensor<5x24xf32>
-
-      // Check matmul uses the padded operands.
-      //      MATMUL:   = linalg.matmul ins(%[[T6]], %[[T7]] {{.*}} outs(%[[T8]]
-      %15 = linalg.matmul ins(%12, %13 : tensor<5x3xf32>, tensor<3x24xf32>) outs(%14 : tensor<5x24xf32>) -> tensor<5x24xf32>
-      %16 = tensor.extract_slice %15[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor<?x24xf32>
-      %17 = tensor.insert_slice %16 into %arg6[0, 0] [%1, 24] [1, 1] : tensor<?x24xf32> into tensor<?x24xf32>
-      scf.yield %17 : tensor<?x24xf32>
-    }
-    %8 = tensor.insert_slice %7 into %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor<?x24xf32> into tensor<12x24xf32>
-    scf.yield %8 : tensor<12x24xf32>
-  }
-  return %0 : tensor<12x24xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
-#map1 = affine_map<(d0) -> (-d0 + 4)>
-
-//      TRANSP:  transpose
-// TRANSP-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x?xf32>
-func.func @transpose(%arg0: tensor<24x?xf32>,
-                %arg1: tensor<?xf32>,
-                %arg2: tensor<24xf32>) -> tensor<24xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32>
-
-  // Transpose the padded matrix.
-  //      TRANSP:  %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = {{.*}}iter_args(%[[T1:.*]] =
-  //        TRANSP:   %[[T2:.*]] = tensor.pad
-  //        TRANSP:   %[[T3:.*]] = tensor.extract_slice %[[T1]]
-  //        TRANSP:   %[[T4:.*]] = linalg.generic
-  //   TRANSP-SAME:     ins(%[[T2]] : tensor<24x4xf32>
-  //   TRANSP-SAME:     outs(%[[T3]] : tensor<4x24xf32>
-  //        TRANSP:   %[[T5:.*]] = tensor.insert_slice %[[T4]] into %[[T1]]
-  //        TRANSP:   scf.yield %[[T5:.*]]
-
-  //      TRANSP:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
-    %2 = affine.min #map0(%arg3)[%0]
-    %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32>
-
-    // Index the packed vector and transpose back.
-    //      TRANSP:   %[[T6:.*]] = tensor.extract_slice %[[T0]]
-    //      TRANSP:   %[[T7:.*]] = linalg.init_tensor
-    //      TRANSP:   %[[T8:.*]] = linalg.generic
-    // TRANSP-SAME:     ins(%[[T6]] : tensor<4x24xf32>
-    // TRANSP-SAME:     outs(%[[T7]] : tensor<24x4xf32>
-    %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor<?xf32> to tensor<?xf32>
-    %5 = affine.apply #map1(%2)
-    %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5]  {
-    ^bb0(%arg5: index, %arg6: index):  // no predecessors
-      tensor.yield %cst : f32
-    } : tensor<24x?xf32> to tensor<24x4xf32>
-    %7 = tensor.pad %4 nofold low[%c0] high[%5]  {
-    ^bb0(%arg5: index):  // no predecessors
-      tensor.yield %cst : f32
-    } : tensor<?xf32> to tensor<4xf32>
-
-    // Check matvec uses the packed input vector.
-    //      TRANSP:    = linalg.matvec ins(%[[T8]]
-    %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
-    scf.yield %8 : tensor<24xf32>
-  }
-  return %1 : tensor<24xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/interchange.mlir b/mlir/test/Dialect/Linalg/interchange.mlir
deleted file mode 100644
index 1d422eef242b9..0000000000000
--- a/mlir/test/Dialect/Linalg/interchange.mlir
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" -test-linalg-codegen-strategy="iterator-interchange=1,3,4,2,0" | FileCheck --check-prefix=CANCEL-OUT %s
-
-#map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
-
-func.func @interchange_generic_op(%arg0 : memref<1x2x3x4x5xindex>, %arg1 : memref<1x2x4xindex>) {
-  linalg.generic {
-    indexing_maps = [#map0, #map1],
-    iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]}
-  ins(%arg0 : memref<1x2x3x4x5xindex>)
-  outs(%arg1 : memref<1x2x4xindex>) {
-      ^bb0(%arg2 : index, %arg3 : index) :
-        %0 = linalg.index 0 : index
-        %1 = linalg.index 1 : index
-        %2 = linalg.index 4 : index
-        %3 = arith.subi %0, %1 : index
-        %4 = arith.addi %3, %2 : index
-        %5 = arith.addi %4, %arg2 : index
-        linalg.yield %5 : index
-      }
-  return
-}
-
-//    CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4, d2, d0)>
-//    CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d2)>
-//        CHECK: func @interchange_generic_op
-//        CHECK:   linalg.generic
-//   CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
-//   CHECK-SAME:     iterator_types = ["reduction", "parallel", "parallel", "parallel", "reduction"]
-//    CHECK-DAG:     %[[IDX0:.+]] = linalg.index 1 : index
-//    CHECK-DAG:     %[[IDX1:.+]] = linalg.index 3 : index
-//    CHECK-DAG:     %[[IDX4:.+]] = linalg.index 0 : index
-//        CHECK:     %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index
-//        CHECK:     %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index
-//        CHECK:     %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index
-
-//  CANCEL-OUT-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-//  CANCEL-OUT-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
-//      CANCEL-OUT: func @interchange_generic_op
-//      CANCEL-OUT:   linalg.generic
-// CANCEL-OUT-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
-// CANCEL-OUT-SAME:     iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]
-//  CANCEL-OUT-DAG:     %[[IDX0:.+]] = linalg.index 0 : index
-//  CANCEL-OUT-DAG:     %[[IDX1:.+]] = linalg.index 1 : index
-//  CANCEL-OUT-DAG:     %[[IDX4:.+]] = linalg.index 4 : index
-//      CANCEL-OUT:     %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index
-//      CANCEL-OUT:     %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index
-//      CANCEL-OUT:     %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index
-
-

diff  --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
deleted file mode 100644
index 0e0e2e1066d6e..0000000000000
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ /dev/null
@@ -1,600 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=1,1,0 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=MATMUL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,1.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,0.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL-MATMUL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32 pack-paddings=1,1,0 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1 pack-paddings=1,1,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=PARTIAL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.depthwise_conv_2d_nhwc_hwc pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=1,2 pack-paddings=1,0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=DEPTHWISE_CONV_2D
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 12, 7)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (-s0 + 12, 7)>
-
-//      MATMUL:  static_sizes_output_divisible
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-// MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
-                                         %arg1: tensor<12x25xf32>,
-                                         %arg2: tensor<24x25xf32>,
-                                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
-  //  MATMUL-DAG: %[[CST:.*]] = arith.constant 0.
-  //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
-
-  //      MATMUL:   %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
-  %0 = affine.min #map()[%iv2]
-
-  //      MATMUL:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  //      MATMUL:   %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  //      MATMUL:   %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
-  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
-  // Check statically sized matmul inputs with partially divisible sizes are padded.
-  //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
-  //      MATMUL:   %[[T3:.*]] = tensor.pad %[[T0]] nofold
-  // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
-  // MATMUL-SAME:                  [%[[C0]], %[[V0]]
-  //      MATMUL:   tensor.yield %[[CST]]
-  //      MATMUL:   %[[T4:.*]] = tensor.pad %[[T1]] nofold
-
-  // Check the statically sized matmul output with fully divisible sizes is not padded.
-  //      MATMUL:   %[[T5:.*]] = linalg.matmul
-  // MATMUL-SAME:                  ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
-  // MATMUL-SAME:                  outs(%[[T2]] : tensor<4x5xf32>)
-  //      MATMUL:   %[[T6:.*]] = tensor.insert_slice %[[T5]]
-  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
-  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 25, 7)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (-s0 + 25, 7)>
-
-//      MATMUL:  static_sizes_input_divisible
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-// MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
-                                        %arg1: tensor<12x25xf32>,
-                                        %arg2: tensor<24x25xf32>,
-                                        %iv0 : index, %iv1 : index, %iv2 : index) ->  tensor<24x25xf32> {
-  //  MATMUL-DAG: %[[CST:.*]] = arith.constant 0.
-  //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
-
-  %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
-
-  //      MATMUL:   %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
-  %4 = affine.min #map()[%iv1]
-  %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
-
-  //      MATMUL:   %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
-  %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
-  // Check the statically sized matmul output with partially divisible sizes is padded.
-  //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
-  //      MATMUL:   %[[T1:.*]] = tensor.pad %[[T0]] low
-  // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
-  // MATMUL-SAME:                  [%[[C0]], %[[V0]]
-  //      MATMUL:   tensor.yield %[[CST]]
-
-  //      MATMUL:   %[[T2:.*]] = linalg.matmul
-  // MATMUL-SAME:                  outs(%[[T1]] : tensor<4x7xf32>)
-  //      MATMUL:   %[[T3:.*]] = tensor.extract_slice %[[T2]]
-  //      MATMUL:   %[[T4:.*]] = tensor.insert_slice %[[T3]]
-  %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
-  %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-
-   //      MATMUL:   return %[[T4]]
-  func.return %8 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 5)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 7)>
-// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 6)>
-// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
-// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>
-
-#map0 = affine_map<()[s0, s1] -> (-s0 + s1, 5)>
-#map1 = affine_map<()[s0, s1] -> (-s0 + s1, 6)>
-#map2 = affine_map<()[s0, s1] -> (-s0 + s1, 7)>
-
-//      MATMUL:  dynamic_sizes
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME:    %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME:    %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @dynamic_sizes(%arg0: tensor<?x?xf32>,
-                         %arg1: tensor<?x?xf32>,
-                         %arg2: tensor<?x?xf32>,
-                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
-  //  MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //  MATMUL-DAG: %[[C1:.*]] = arith.constant 1
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-
-  //  MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
-  //  MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
-  //  MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
-  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
-  %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
-
-  //      MATMUL:   %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
-  //      MATMUL:   %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
-  //      MATMUL:   %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
-  %6 = affine.min #map0()[%iv0, %0]
-  %7 = affine.min #map1()[%iv2, %1]
-  %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %9 = affine.min #map2()[%iv1, %2]
-  %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-
-  // Check all matmul operands are padded.
-  //      MATMUL:   %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
-  //      MATMUL:   %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
-  //      MATMUL:   %[[T3:.*]] = tensor.pad %{{.*}} nofold
-  // MATMUL-SAME:                  [%[[C0]], %[[C0]]]
-  // MATMUL-SAME:                  [%[[V0]], %[[V1]]
-  //      MATMUL:   %[[T4:.*]] = tensor.pad %{{.*}} nofold
-  //      MATMUL:   %[[T5:.*]] = tensor.pad %{{.*}} low
-
-  // Check the dynamic matmul has been erased.
-  //  MATMUL-NOT:   = linalg.matmul {{.*}} tensor<?x?xf32>
-
-  // Check all padded matmul operands are statically sized.
-  //      MATMUL:   %[[T6:.*]] = linalg.matmul
-  // MATMUL-SAME:                  ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
-  // MATMUL-SAME:                  outs(%[[T5]] : tensor<5x7xf32>)
-  //      MATMUL:   %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
-  //      MATMUL:   %[[T8:.*]] = tensor.insert_slice %[[T7]]
-  %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  //      MATMUL:   return %[[T8]]
-  func.return %13 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      FILL-MATMUL:  pad_multiple
-// FILL-MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @pad_multiple(%arg0: tensor<64x64xf32>,
-                        %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-
-  //      FILL-MATMUL:  %[[T0:.*]] = tensor.extract_slice
-  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Check the two operations are padded by the same pad tensor operation.
-  //      FILL-MATMUL:  %[[T1:.*]] = tensor.pad %[[T0]]
-  //      FILL-MATMUL:  %[[T2:.*]] = linalg.fill {{.*}} outs(%[[T1]]
-  //      FILL-MATMUL:  %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
-  //      FILL-MATMUL:  = tensor.extract_slice %[[T3]]
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %2 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  pad_chain
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @pad_chain(%arg0: tensor<64x64xf32>,
-                     %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Check the matmul at the end of the use-def chain is padded.
-  //      MATMUL:  %[[T0:.*]] = linalg.fill
-  //      MATMUL:  %[[T1:.*]] = tensor.pad %[[T0]]
-  //      MATMUL:  %[[T2:.*]] = linalg.matmul {{.*}} outs(%[[T1]]
-  //      MATMUL:  = tensor.extract_slice %[[T2]]
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %2 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  compose_padding
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @compose_padding(%arg0: tensor<64x64xf32>,
-                           %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-
-  //      MATMUL:  %[[SIZE:.*]] = affine.min
-  %size = affine.min #map0()[%iv0]
-
-  //      MATMUL:  %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL-SAME:                                     [0, 0]
-  // MATMUL-SAME:                                     [%[[SIZE]], %[[SIZE]]]
-  //      MATMUL:  %[[T1:.*]] = tensor.pad %[[T0]]
-  //      MATMUL:  %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
-  //      MATMUL:  %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
-  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-  %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0]  {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<?x?xf32> to tensor<64x64xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32>
-  %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Check there are no additional pad tensor operations.
-  //  MATMUL-NOT:  tensor.pad
-
-  // Check the matmul directly uses the result of the fill operation.
-  //      MATMUL:  %[[T4:.*]] = linalg.matmul ins(%[[T3]]
-  //      MATMUL:  %[[T5:.*]] = tensor.extract_slice %[[T4]]
-  // MATMUL-SAME:                                     [0, 0]
-  // MATMUL-SAME:                                     [%[[SIZE]], %[[SIZE]]]
-  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-  //      MATMUL:  return %[[T5]]
-  func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  different_padding_values
-func.func @different_padding_values(%arg0: tensor<64x64xf32>,
-                                    %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 42.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-  %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0]  {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<?x?xf32> to tensor<64x64xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Different padding values prevent composing the paddings (42.0 vs. 0.0).
-  //      MATMUL:  = linalg.fill
-  //      MATMUL:  = tensor.pad
-  //      MATMUL:  = linalg.matmul
-  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  different_padding_dynamic_sizes
-func.func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
-                                           %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-  %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0]  {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<?x?xf32> to tensor<64x64xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
-  //      MATMUL:  = linalg.fill
-  //      MATMUL:  = tensor.pad
-  //      MATMUL:  = linalg.matmul
-  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  different_padding_dynamic_rank
-func.func @different_padding_dynamic_rank(%arg0: tensor<64x64x1xf32>,
-                                          %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0, 0] [%size, %size, 1] [1, 1, 1] : tensor<64x64x1xf32> to tensor<?x?xf32>
-  %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0]  {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<?x?xf32> to tensor<64x64xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-  %3 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
-  // Different dynamic ranks prevent composing the paddings ([%size, %size, 1] vs [%size, %size]).
-  //      MATMUL:  = linalg.fill
-  //      MATMUL:  = tensor.pad
-  //      MATMUL:  = linalg.matmul
-  %4 = linalg.matmul ins(%3, %3 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %4 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      MATMUL:  different_padding_static_sizes
-func.func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
-                                          %iv0 : index) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
-  %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0]  {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<?x?xf32> to tensor<62x62xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<62x62xf32>) -> tensor<62x62xf32>
-  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
-
-  // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
-  //      MATMUL:  = linalg.fill
-  //      MATMUL:  = tensor.pad
-  //      MATMUL:  = linalg.matmul
-  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      FILL:  scalar_operand
-// FILL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: f32
-// FILL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func.func @scalar_operand(%arg0: f32,
-                          %arg1: tensor<24x12xf32>,
-                          %iv0 : index) -> tensor<24x12xf32> {
-  %0 = affine.min #map0()[%iv0]
-
-  //      FILL:   %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
-  //      FILL:   %[[T1:.*]] = tensor.pad %[[T0]] nofold
-  %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-
-  // Check only the fill output operand is padded.
-  //      FILL:   %[[T6:.*]] = linalg.fill ins(%[[ARG0]]{{.*}}outs(%[[T1]]
-  %2 = linalg.fill ins(%arg0 : f32) outs(%1 : tensor<4x?xf32>) -> tensor<4x?xf32>
-  %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
-  func.return %3 : tensor<24x12xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      MATMUL:  static_extract_slice_missing
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
-func.func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
-                                        %arg1: tensor<12x25xf32>,
-                                        %arg2: tensor<4x5xf32>,
-                                        %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
-  %0 = affine.min #map0()[%iv2]
-  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-
-  // Check the matmul inputs are padded despite the missing slice for the static output.
-  //      MATMUL:  %[[T0:.*]] = tensor.pad
-  //      MATMUL:  %[[T1:.*]] = tensor.pad
-  //      MATMUL:  = linalg.matmul ins(%[[T0]], %[[T1]]
-  // MATMUL-SAME:                 outs(%[[ARG2]]
-  %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
-  func.return %3 : tensor<4x5xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      MATMUL:  dynamic_extract_slice_missing
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
-                                         %arg1: tensor<12x25xf32>,
-                                         %arg2: tensor<24x25xf32>,
-                                         %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
-  %0 = affine.min #map0()[%iv2]
-
-  //      MATMUL:  %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
-  //      MATMUL:  %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
-  // Check the matmul is not padded due to the missing slice for the dynamic input.
-  //      MATMUL:  = linalg.matmul ins(%[[ARG0]], %[[T0]]
-  // MATMUL-SAME:                 outs(%[[T1]]
-  %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
-  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      INPUTS-ONLY:  static_input_padding_only
-// INPUTS-ONLY-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @static_input_padding_only(%arg0: tensor<24x12xf32>,
-                                     %arg1: tensor<12x25xf32>,
-                                     %arg2: tensor<24x25xf32>,
-                                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
-  %0 = affine.min #map0()[%iv2]
-  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-
-  // INPUTS-ONLY:  %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
-  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
-  // Check the matmul inputs are padded despite the failure to compute a padding value for the static output.
-  // INPUTS-ONLY:  %[[T1:.*]] = tensor.pad
-  // INPUTS-ONLY:  %[[T2:.*]] = tensor.pad
-  // INPUTS-ONLY:  = linalg.matmul ins(%[[T1]], %[[T2]]
-  // INPUTS-ONLY-SAME:             outs(%[[T0]]
-  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
-  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-  func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      INPUTS-ONLY:  dynamic_input_padding_only
-// INPUTS-ONLY-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>,
-// INPUTS-ONLY-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
-// INPUTS-ONLY-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
-                                      %arg1: tensor<12x25xf32>,
-                                      %arg2: tensor<24x25xf32>,
-                                      %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
-  %0 = affine.min #map0()[%iv2]
-
-  // INPUTS-ONLY:  %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // INPUTS-ONLY:  %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  // INPUTS-ONLY:  %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
-  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
-  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
-  // Check the matmul is not padded due to the failure to compute a padding value for the dynamic output.
-  // INPUTS-ONLY:  = linalg.matmul ins(%[[T0]], %[[T1]]
-  // INPUTS-ONLY-SAME:             outs(%[[T2]]
-  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32>
-  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-  func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-//      FILL:  rank_reducing
-// FILL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x64x1x64xf32>
-func.func @rank_reducing(%arg0: tensor<1x64x1x64xf32>,
-                         %iv0 : index) -> tensor<1x?x?xf32> {
-  //      FILL:  %[[CST:.*]] = arith.constant 1.
-  %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0()[%iv0]
-  %0 = tensor.extract_slice %arg0[0, 0, 0, 0] [1, %size, 1, %size] [1, 1, 1, 1] : tensor<1x64x1x64xf32> to tensor<1x?x?xf32>
-
-  // Check the fill is padded despite the rank-reducing slice operation.
-  //      FILL:  %[[T0:.*]] = tensor.pad
-  //      FILL:  tensor.yield %[[CST]]
-  //      FILL:  %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  // FILL-SAME:    tensor<1x64x64xf32>
-  //      FILL:  = tensor.extract_slice %[[T1]]
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32>
-  func.return %1 : tensor<1x?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-//      PARTIAL:  padding_the_output_dims_only
-func.func @padding_the_output_dims_only(%arg0: tensor<24x12xf32>,
-                                        %arg1: tensor<12x25xf32>,
-                                        %arg2: tensor<24x25xf32>,
-                                        %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
-  //  PARTIAL-DAG:  %[[C0:.*]] = arith.constant 0 : index
-  //  PARTIAL-DAG:  %[[TS:.*]] = affine.apply
-  %0 = affine.min #map0()[%iv2]
-
-  // Check only the output dimensions of the matmul are padded.
-  //      PARTIAL:  %[[T0:.*]] = tensor.pad
-  // PARTIAL-SAME:                 [%[[TS]], %[[C0]]
-  //      PARTIAL:  %[[T1:.*]] = tensor.pad
-  // PARTIAL-SAME:                 [%[[C0]], %[[TS]]
-  //      PARTIAL:  %[[T2:.*]] = tensor.pad
-  // PARTIAL-SAME:                 [%[[TS]], %[[TS]]
-  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [%0, %0] [1, 1] : tensor<24x12xf32> to tensor<?x?xf32>
-  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
-  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
-
-  //      PARTIAL:  = linalg.matmul ins(%[[T0]], %[[T1]]
-  // PARTIAL-SAME:             outs(%[[T2]]
-  %4 = linalg.matmul ins(%1, %2 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
-  func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-//  DEPTHWISE_CONV_2D-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (4, -s0 + 11)>
-//  DEPTHWISE_CONV_2D-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2)>
-//  DEPTHWISE_CONV_2D-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2 + 1)>
-//  DEPTHWISE_CONV_2D-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * -2 + 8)>
-//  DEPTHWISE_CONV_2D-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 4)>
-
-#map0 = affine_map<()[s0] -> (4, -s0 + 11)>
-#map1 = affine_map<()[s0] -> (s0 * 2)>
-#map2 = affine_map<()[s0] -> (s0 * 2 + 1)>
-
-//      DEPTHWISE_CONV_2D: depthwise_conv_2d_padding
-// DEPTHWISE_CONV_2D-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x23x3x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<3x3x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<1x13x1x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
-func.func @depthwise_conv_2d_padding(%arg0: tensor<1x23x3x16xf32>,
-                                     %arg1: tensor<3x3x16xf32>,
-                                     %arg2: tensor<1x13x1x16xf32>,
-                                     %iv0: index) -> tensor<1x?x1x16xf32> {
-  //  DEPTHWISE_CONV_2D-DAG: %[[CST:.*]] = arith.constant 0.
-  //  DEPTHWISE_CONV_2D-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //  DEPTHWISE_CONV_2D-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[IV0]]]
-  %0 = affine.min #map0()[%iv0]
-  %1 = affine.apply #map1()[%iv0]
-  %2 = affine.apply #map2()[%0]
-
-  //      DEPTHWISE_CONV_2D: %[[T3:.*]] = tensor.extract_slice %[[ARG0]]
-  //      DEPTHWISE_CONV_2D: %[[T4:.*]] = tensor.extract_slice %[[ARG2]]
-  %3 = tensor.extract_slice %arg0[0, %1, 0, 0] [1, %2, 3, 16] [1, 1, 1, 1] : tensor<1x23x3x16xf32> to tensor<1x?x3x16xf32>
-  %4 = tensor.extract_slice %arg2[0, %iv0, 0, 0] [1, %0, 1, 16] [1, 1, 1, 1] : tensor<1x13x1x16xf32> to tensor<1x?x1x16xf32>
-
-  // Check the padding on the input.
-  //      DEPTHWISE_CONV_2D: %[[T5:.*]] = affine.apply #[[MAP3]]()[%[[T0]]]
-  //      DEPTHWISE_CONV_2D: %[[T6:.*]] = tensor.pad %[[T3]]
-  // DEPTHWISE_CONV_2D-SAME:                low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]]
-  // DEPTHWISE_CONV_2D-SAME:                high[%[[C0]], %[[T5]], %[[C0]], %[[C0]]]
-  //      DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32
-
-  // Check the padding on the output.
-  //      DEPTHWISE_CONV_2D: %[[T7:.*]] = affine.apply #[[MAP4]]()[%[[T0]]]
-  //      DEPTHWISE_CONV_2D: %[[T8:.*]] = tensor.pad %[[T4]]
-  // DEPTHWISE_CONV_2D-SAME:                low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]]
-  // DEPTHWISE_CONV_2D-SAME:                high[%[[C0]], %[[T7]], %[[C0]], %[[C0]]]
-  //      DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32
-
-  //      DEPTHWISE_CONV_2D: %[[T9:.*]] = linalg.depthwise_conv_2d_nhwc_hwc
-  // DEPTHWISE_CONV_2D-SAME: ins(%[[T6]], %[[ARG1]] : tensor<1x9x3x16xf32>, tensor<3x3x16xf32>)
-  // DEPTHWISE_CONV_2D-SAME: outs(%[[T8]] : tensor<1x4x1x16xf32>)
-  %5 = linalg.depthwise_conv_2d_nhwc_hwc
-      {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
-      ins(%3, %arg1 : tensor<1x?x3x16xf32>, tensor<3x3x16xf32>)
-      outs(%4 : tensor<1x?x1x16xf32>) -> tensor<1x?x1x16xf32>
-
-  // Check the extract_slice to crop the padded output before return.
-  //      DEPTHWISE_CONV_2D: %[[T10:.*]] = tensor.extract_slice %[[T9]][0, 0, 0, 0]
-  // DEPTHWISE_CONV_2D-SAME:                 [1, %[[T0]], 1, 16]
-  //      DEPTHWISE_CONV_2D: return %[[T10]] : tensor<1x?x1x16xf32>
-  return %5 : tensor<1x?x1x16xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
deleted file mode 100644
index 463d205ec2a14..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=0,0,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.elemwise_unary fuse tile-sizes=32,32,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=UNARY %s
-
-// MATMUL-LABEL: @tile_sizes_zero(
-func.func @tile_sizes_zero(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %cst = arith.constant 0.0 : f32
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
-  %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
-
-  //   MATMUL-NOT:   scf.for
-  //       MATMUL:   linalg.fill
-  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-  //   MATMUL-NOT:   scf.for
-  //       MATMUL:   linalg.matmul
-  %result = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
-      outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %result : tensor<?x?xf32>
-}
-
-// -----
-
-// UNARY-LABEL: @shape_only(
-func.func @shape_only(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %cst = arith.constant 0.0 : f32
-
-  //       UNARY:   linalg.fill
-  %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-  //       UNARY:   scf.for
-  //       UNARY:     scf.for
-  //   UNARY-NOT:       linalg.fill
-  //       UNARY:       linalg.elemwise_unary
-  %1 = linalg.elemwise_unary {fun = #linalg.unary_fn<exp>}
-      ins(%arg0 : tensor<?x?xf32>) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %1 : tensor<?x?xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
deleted file mode 100644
index ab709c69651aa..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
+++ /dev/null
@@ -1,323 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s
-
-//  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
-//  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
-//  MATMUL-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 24, d0)>
-//  MATMUL-DAG:  #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
-
-//      MATMUL:  fuse_input
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func.func @fuse_input(%arg0: tensor<24x12xf32>,
-                      %arg1: tensor<12x25xf32>,
-                      %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32>
-
-  //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
-  //      MATMUL:      %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
-  //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
-  //      MATMUL:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
-
-  // Tile both input operand dimensions.
-  //      MATMUL:        %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]])
-  //      MATMUL:        %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
-  //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL-SAME:                                          %[[IV1]], %[[IV2]]
-  // MATMUL-SAME:                                          %[[UB1]], %[[UB2]]
-  //      MATMUL:        %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  //      MATMUL:        %{{.*}} = linalg.matmul ins(%[[T1]]
-  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-//  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
-//  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
-
-//      MATMUL:  fuse_output
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func.func @fuse_output(%arg0: tensor<24x12xf32>,
-                       %arg1: tensor<12x25xf32>,
-                       %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  //  MATMUL-DAG:  %[[C0:.*]] = arith.constant 0 : index
-  //  MATMUL-DAG:  %[[C1:.*]] = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-
-  // Update the iteration argument of the outermost tile loop.
-  //      MATMUL:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
-  //      MATMUL:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
-  //      MATMUL:      %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
-  //      MATMUL:      %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]])
-
-  // Tile both output operand dimensions.
-  //      MATMUL:      %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
-  // MATMUL-SAME:                                        %[[IV1]], %[[IV0]]
-  // MATMUL-SAME:                                        %[[TS1]], %[[TS0]]
-  //      MATMUL:      %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  //      MATMUL:        scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
-
-  // Check there is an extract/insert slice pair for the output operand.
-  //  MATMUL-DAG:          %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]]
-  //  MATMUL-DAG:          %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]]
-  //      MATMUL:          %[[T2:.*]] = tensor.extract_slice %[[ARG5]]
-  // MATMUL-SAME:                                            0, 0
-  // MATMUL-SAME:                                            %[[D0]], %[[D1]]
-  //      MATMUL:          %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
-  //      MATMUL:          %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]]
-  // MATMUL-SAME:                                            0, 0
-  // MATMUL-SAME:                                            %[[D0]], %[[D1]]
-  %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-//  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
-//  MATMUL-DAG:  #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
-//  MATMUL-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 25, d0)>
-//  MATMUL-DAG:  #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
-
-//      MATMUL:  fuse_reduction
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
-// MATMUL-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
-func.func @fuse_reduction(%arg0: tensor<24x12xf32>,
-                          %arg1: tensor<12x25xf32>,
-                          %arg2: tensor<24x25xf32>,
-                          %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
-    %2 = arith.addf %arg4, %arg5 : f32
-    linalg.yield %2 : f32
-  } -> tensor<12x25xf32>
-
-  //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
-  //      MATMUL:      %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
-  //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
-  //      MATMUL:        %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
-  //      MATMUL:        %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
-  //      MATMUL:        %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]])
-
-  // Tile only the parallel dimensions but not the reduction dimension.
-  //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
-  // MATMUL-SAME:                                          %[[IV2]], 0, %[[IV0]]
-  // MATMUL-SAME:                                          %[[UB2]], 7, %[[UB0]]
-  //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  // MATMUL-SAME:                                          %[[IV2]], %[[IV0]]
-  // MATMUL-SAME:                                          %[[UB2]], %[[UB0]]
-  //      MATMUL:        %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
-  //      MATMUL:        %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
-  %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d1, d0)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-
-//      MATMUL:  fuse_transposed
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
-func.func @fuse_transposed(%arg0: tensor<24x12xf32>,
-                           %arg1: tensor<12x25xf32>,
-                           %arg2: tensor<24x25xf32>,
-                           %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
-    %2 = arith.addf %arg4, %arg5 : f32
-    linalg.yield %2 : f32
-  } -> tensor<24x12xf32>
-
-  //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
-  //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
-
-  // Swap the input operand slice offsets due to the transposed indexing map.
-  //      MATMUL:        %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
-  // MATMUL-SAME:                                          %[[IV2]], %[[IV1]]
-  //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL-SAME:                                          %[[IV1]], %[[IV2]]
-  //      MATMUL:        %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
-  //      MATMUL:        %{{.*}} = linalg.matmul ins(%[[T2]]
-  %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-//      MATMUL:  fuse_input_and_output
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
-                                 %arg1: tensor<12x25xf32>,
-                                 %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32>
-  %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-
-  // Fuse both producers to the appropriate tile loops.
-  //      MATMUL:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
-  //      MATMUL:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
-  //      MATMUL:      %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
-  // MATMUL-SAME:                                        %[[IV1]], %[[IV0]]
-  //      MATMUL:      %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  //      MATMUL:        scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
-  //      MATMUL:          %[[T2:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL-SAME:                                            %[[IV1]], %[[IV2]]
-  //      MATMUL:          %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
-  //      MATMUL:          %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
-  //      MATMUL:          %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
-  %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  func.return %2 : tensor<24x25xf32>
-}
-
-// -----
-
-//  MATMUL-DAG:  #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-#map0 = affine_map<(d0, d1) -> (d1, d0)>
-
-//      MATMUL:  fuse_indexed
-// MATMUL-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
-func.func @fuse_indexed(%arg0: tensor<24x12xi32>,
-                        %arg1: tensor<12x25xi32>,
-                        %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) {
-  ^bb0(%arg3: i32):
-    %6 = linalg.index 0 : index
-    %7 = linalg.index 1 : index
-    %8 = arith.addi %6, %7 : index
-    %9 = arith.index_cast %8 : index to i32
-    linalg.yield %9 : i32
-  } -> tensor<12x25xi32>
-
-  //      MATMUL:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  //      MATMUL:    scf.for %[[IV1:[0-9a-zA-Z]*]] =
-  //      MATMUL:      scf.for %[[IV2:[0-9a-zA-Z]*]] =
-
-  // Shift the indexes by the slice offsets and swap the offsets due to the transposed indexing map.
-  //      MATMUL:        %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  // MATMUL-SAME:                                          %[[IV2]], %[[IV0]]
-  //      MATMUL:  linalg.generic {{.*}} outs(%[[T1]]
-  //      MATMUL:  %[[IDX0:.*]] = linalg.index 0
-  //      MATMUL:  %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]])
-  //      MATMUL:  %[[IDX1:.*]] = linalg.index 1
-  //      MATMUL:  %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
-  //      MATMUL:  %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
-  %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
-  func.return %1 : tensor<24x25xi32>
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
-#map1 = affine_map<(d0, d1) -> (d0)>
-
-//      GENERIC:  fuse_outermost_reduction
-// GENERIC-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-// GENERIC-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
-func.func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
-                                    %arg1: tensor<10xf32>) -> tensor<10xf32> {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
-
-  // Cannot fuse the output fill since the reduction loop is the outermost loop.
-  //      GENERIC:      %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG1]]
-  %1 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<10xf32>) -> tensor<10xf32>
-
-  //      GENERIC:  scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]]
-  //      GENERIC:    scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
-
-  // Check the input fill has been fused.
-  //      GENERIC:      %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
-  // GENERIC-SAME:                                        %[[IV1]], %[[IV0]]
-  //      GENERIC:      %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
-  //      GENERIC:      %[[T3:.*]] = tensor.extract_slice %[[ARG3]]
-  // GENERIC-SAME:                                        %[[IV1]]
-  //      GENERIC:  linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
-  %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
-    %3 = arith.addf %arg2, %arg3 : f32
-    linalg.yield %3 : f32
-  } -> tensor<10xf32>
-  func.return %2 : tensor<10xf32>
-}
-
-// -----
-
-//  GENERIC-DAG:  #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-//  GENERIC-DAG:  #[[MAP1:.*]] = affine_map<(d0, d1) -> (-d0 - d1 + 17, 8)>
-//  GENERIC-DAG:  #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (-d1 - d2 + 17, d0)>
-#map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-
-//      GENERIC:  fuse_non_rectangular
-// GENERIC-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-func.func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
-                                %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
-
-  //  GENERIC-DAG:  %[[C0:.*]] = arith.constant 0 : index
-  //  GENERIC-DAG:  %[[C4:.*]] = arith.constant 4 : index
-  //  GENERIC-DAG:  %[[C5:.*]] = arith.constant 5 : index
-  //  GENERIC-DAG:  %[[C8:.*]] = arith.constant 8 : index
-  //  GENERIC-DAG:  %[[C10:.*]] = arith.constant 10 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
-
-  //      GENERIC:  scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]]
-  //      GENERIC:    scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]]
-
-  // Compute producer on a hyper rectangular bounding box. Along the second dimension,
-  // the offset is set to the sum of the induction variables, and the upper bound
-  // to either 8 (tile size) or 17 (sum of max indices (9+7) then + 1) minus the
-  // induction variables.
-  //  GENERIC-DAG:      %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]]
-  //  GENERIC-DAG:      %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]]
-  //  GENERIC-DAG:      %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]]
-  //      GENERIC:      %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // GENERIC-SAME:                                        %[[IV1]], %[[SUM]]
-  // GENERIC-SAME:                                                , %[[UB1]]
-  //      GENERIC:      %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
-  %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
-    %2 = arith.addf %arg2, %arg3 : f32
-    linalg.yield %2 : f32
-  } -> tensor<10x8xf32>
-  func.return %1 : tensor<10x8xf32>
-}

diff  --git a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
deleted file mode 100644
index 67b2c606f3648..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
+++ /dev/null
@@ -1,84 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.conv_2d fuse tile-sizes=4,4,0,0 tile-interchange=0,1,2,3 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=CONV %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=4,4,0 tile-interchange=0,1,2 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s
-
-//      CONV:  fuse_conv_chain
-// CONV-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<2x2xf32>
-// CONV-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<11x11xf32>
-// CONV-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<10x10xf32>
-// CONV-SAME:    %[[ARG3:[0-9a-zA-Z]*]]: tensor<9x9xf32>
-// CONV-SAME:    %[[ARG4:[0-9a-zA-Z]*]]: tensor<8x8xf32>
-func.func @fuse_conv_chain(%arg0: tensor<2x2xf32>,
-                              %arg1: tensor<11x11xf32>,
-                              %arg2: tensor<10x10xf32>,
-                              %arg3: tensor<9x9xf32>,
-                              %arg4: tensor<8x8xf32>) -> tensor<8x8xf32> {
-  %cst = arith.constant 1.0 : f32
-
-  // Do not tile the filter fill since the filter dimensions are not tiled.
-  //      CONV:  %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]]
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<2x2xf32>) -> tensor<2x2xf32>
-
-  // Fuse all other operations.
-  //      CONV:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[ARG4]]
-  //      CONV:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG6:.*]] = %[[ARG5]]
-
-  //      CONV:          %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-  // CONV-SAME:                                            %[[IV0]], %[[IV1]]
-  //      CONV:          %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
-  // CONV-SAME:                                            %[[IV0]], %[[IV1]]
-  //      CONV:          %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
-  //      CONV:          %[[T4:.*]] = linalg.conv_2d ins(%[[T1]], %[[T0]] : {{.*}} outs(%[[T3]]
-  %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<10x10xf32>) -> tensor<10x10xf32>
-  %2 = linalg.conv_2d ins(%arg1, %0 : tensor<11x11xf32>, tensor<2x2xf32>) outs(%1 : tensor<10x10xf32>) -> tensor<10x10xf32>
-
-  //      CONV:          %[[T5:.*]] = tensor.extract_slice %[[ARG3]]
-  // CONV-SAME:                                            %[[IV0]], %[[IV1]]
-  //      CONV:          %[[T6:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T5]]
-  //      CONV:          %[[T7:.*]] = linalg.conv_2d ins(%[[T4]], %[[T0]] : {{.*}} outs(%[[T6]]
-  %3 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<9x9xf32>) -> tensor<9x9xf32>
-  %4 = linalg.conv_2d ins(%2, %0 : tensor<10x10xf32>, tensor<2x2xf32>) outs(%3 : tensor<9x9xf32>) -> tensor<9x9xf32>
-
-  // Use the argument passed in by iteration argument.
-  //      CONV:          %[[T8:.*]] = tensor.extract_slice %[[ARG6]]
-  // CONV-SAME:                                            %[[IV0]], %[[IV1]]
-  //      CONV:          %[[T9:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T8]]
-  //      CONV:          %[[T5:.*]] = linalg.conv_2d ins(%[[T7]], %[[T0]] {{.*}} outs(%[[T9]]
-  %5 = linalg.fill ins(%cst : f32) outs(%arg4 : tensor<8x8xf32>) -> tensor<8x8xf32>
-  %6 = linalg.conv_2d ins(%4, %0 : tensor<9x9xf32>, tensor<2x2xf32>) outs(%5 : tensor<8x8xf32>) -> tensor<8x8xf32>
-  return %6 : tensor<8x8xf32>
-}
-
-// -----
-
-//      MATMUL:  fuse_matmul_chain
-// MATMUL-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<8x8xf32>
-func.func @fuse_matmul_chain(%arg0: tensor<8x8xf32>) -> tensor<8x8xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
-
-  // Do not tile rhs fill of the producer matmul since none of its loop dimensions are tiled.
-  //      MATMUL:  %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]]
-  %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<8x8xf32>) -> tensor<8x8xf32>
-
-  //      MATMUL:  scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG1:.*]] = %[[ARG0]]
-  //      MATMUL:    scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[ARG1]]
-
-  // Only the outermost loop of the producer matmul is tiled.
-  //      MATMUL:      %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL-SAME:                                        %[[IV0]], 0
-  //      MATMUL:      %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
-  //      MATMUL:      %[[T3:.*]] = linalg.matmul ins(%[[T2]], %[[T0]] {{.*}}
-  %1 = linalg.matmul ins(%0, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32>
-
-  // Use the argument passed in by iteration argument.
-  //      MATMUL:      %[[T4:.*]] = tensor.extract_slice %[[ARG2]]
-  // MATMUL-SAME:                                        %[[IV0]], %[[IV1]]
-  //      MATMUL:      %[[T5:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T4]]
-  //      MATMUL:      %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T5]]
-  %2 = linalg.matmul ins(%1, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32>
-  return %2 : tensor<8x8xf32>
-}

diff  --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
deleted file mode 100644
index 7e669c10a274e..0000000000000
--- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
-// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=memref.copy register-tile-sizes=4,32 vectorize" | \
-
-// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-vector-to-scf,lower-affine,convert-linalg-to-loops)" | \
-// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-scf-to-cf),convert-vector-to-llvm,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts" | \
-// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
-// Activate to dump assembly
-// R_UN:   -dump-object-file -object-filename=/tmp/a.o \
-// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
-// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
-// Use tee to both print to stderr and FileCheck
-// RUN: tee -a /dev/stderr | FileCheck %s
-
-
-!elem_type_a = f32
-!elem_type_b = f32
-!elem_type_c = f32
-!row_major_A = memref<${M}x${K}x!elem_type_a>
-!row_major_B = memref<${K}x${N}x!elem_type_b>
-!row_major_C = memref<${M}x${N}x!elem_type_c>
-
-func.func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C)
-// TODO: activate manually for now.
-// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
-{
-  linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
-    outs(%c: !row_major_C)
-  return
-}
-
-func.func @print_perf(%iters: index, %total_time: f64) {
-  %c2 = arith.constant 2 : index
-  %cM = arith.constant ${M} : index
-  %cN = arith.constant ${N} : index
-  %cK = arith.constant ${K} : index
-
-  %mn = arith.muli %cM, %cN : index
-  %mnk = arith.muli %mn, %cK : index
-
-  // 2*M*N*K.
-  %flops_per_iter = arith.muli %c2, %mnk : index
-  %flops = arith.muli %iters, %flops_per_iter : index
-  %flops_i64 = arith.index_cast %flops : index to i64
-  %flops_f = arith.sitofp %flops_i64 : i64 to f64
-  %flops_per_s = arith.divf %flops_f, %total_time : f64
-  vector.print %flops_per_s : f64
-
-  return
-}
-
-func.func @main() {
-  %v0 = arith.constant 0.0 : !elem_type_a
-  %v1 = arith.constant 1.0 : !elem_type_a
-
-  %A = memref.alloc() : !row_major_A
-  %B = memref.alloc() : !row_major_B
-  %C = memref.alloc() : !row_major_C
-
-  linalg.fill ins(%v1 : !elem_type_a) outs(%A : !row_major_A)
-  linalg.fill ins(%v1 : !elem_type_b) outs(%B : !row_major_B)
-  linalg.fill ins(%v0 : !elem_type_c) outs(%C : !row_major_C)
-
-  %c0 = arith.constant 0: index
-  %c1 = arith.constant 1: index
-  %iters = arith.constant ${ITERS}: index
-
-  /// Run and dump performance for matmul.
-  /// Preheating run:
-  scf.for %arg0 = %c0 to %iters step %c1 {
-    %z = arith.constant 0.0 : !elem_type_c
-    linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C)
-    func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
-  }
-  %t_start_matmul = call @rtclock() : () -> f64
-  scf.for %arg0 = %c0 to %iters step %c1 {
-    // linalg.matmul writes %C in place, need to reset it to zero every time.
-    // This accounts for about a 10-15% perf hit on small sizes.
-    // Once linalg on tensors is ready, fusing fill at the register level will
-    // be easy.
-    %z = arith.constant 0.0 : !elem_type_c
-    linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C)
-    func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
-  }
-  %t_end_matmul = call @rtclock() : () -> f64
-  %tmatmul = arith.subf %t_end_matmul, %t_start_matmul: f64
-  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
-
-  // CHECK: {{^0$}}
-  %C_ref = memref.alloc() : !row_major_C
-  linalg.fill ins(%v0 : !elem_type_c) outs(%C_ref : !row_major_C)
-  linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
-    outs(%C_ref: !row_major_C)
-  %act = memref.cast %C : !row_major_C to memref<*xf32>
-  %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
-  %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
-  vector.print %errors : i64
-  memref.dealloc %C_ref : !row_major_C
-
-  memref.dealloc %A : !row_major_A
-  memref.dealloc %B : !row_major_B
-  memref.dealloc %C : !row_major_C
-
-  return
-}
-
-func.func private @rtclock() -> f64
-func.func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }
-
-// TODO: init with random, run and check output.
-// func private @fill_random_f32(memref<*xf32>)

diff  --git a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
index b85b88ad8a71f..e11c49fe6d696 100644
--- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
@@ -1,6 +1,5 @@
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRLinalgTestPasses
-  TestLinalgCodegenStrategy.cpp
   TestLinalgElementwiseFusion.cpp
   TestLinalgFusionTransforms.cpp
   TestLinalgHoisting.cpp

diff  --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
deleted file mode 100644
index efe8f38bc23bc..0000000000000
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-//===- TestLinalgCodegenStrategy.cpp - Test Linalg codegen strategy -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements logic for testing the Linalg codegen strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#include <utility>
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Pass/Pass.h"
-
-#include "llvm/ADT/SetVector.h"
-
-using namespace mlir;
-using namespace mlir::linalg;
-
-namespace {
-struct TestLinalgCodegenStrategy
-    : public PassWrapper<TestLinalgCodegenStrategy,
-                         OperationPass<func::FuncOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestLinalgCodegenStrategy)
-
-  StringRef getArgument() const final { return "test-linalg-codegen-strategy"; }
-  StringRef getDescription() const final {
-    return "Test Linalg Codegen Strategy.";
-  }
-  TestLinalgCodegenStrategy() = default;
-  TestLinalgCodegenStrategy(const TestLinalgCodegenStrategy &pass)
-      : PassWrapper(pass) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    // clang-format off
-    registry.insert<AffineDialect,
-                    gpu::GPUDialect,
-                    linalg::LinalgDialect,
-                    memref::MemRefDialect,
-                    scf::SCFDialect,
-                    vector::VectorDialect>();
-    // clang-format on
-  }
-
-  template <typename LinalgNamedOp>
-  void applyStrategyToNamedLinalgOp();
-
-  void runOnOperation() override;
-
-  void runStrategy(const LinalgTilingAndFusionOptions &tilingAndFusionOptions,
-                   const LinalgTilingOptions &tilingOptions,
-                   const LinalgTilingOptions &registerTilingOptions,
-                   LinalgPaddingOptions paddingOptions,
-                   vector::VectorContractLowering vectorContractLowering,
-                   vector::VectorTransferSplit vectorTransferSplit);
-
-  Option<bool> fuse{
-      *this, "fuse",
-      llvm::cl::desc("Fuse the producers after tiling the root op."),
-      llvm::cl::init(false)};
-  ListOption<int64_t> tileSizes{*this, "tile-sizes",
-                                llvm::cl::desc("Specifies the tile sizes.")};
-  ListOption<int64_t> tileInterchange{
-      *this, "tile-interchange",
-      llvm::cl::desc("Specifies the tile interchange.")};
-
-  Option<bool> promote{
-      *this, "promote",
-      llvm::cl::desc("Promote the tile into a small aligned memory buffer."),
-      llvm::cl::init(false)};
-  Option<bool> promoteFullTile{
-      *this, "promote-full-tile-pad",
-      llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."),
-      llvm::cl::init(false)};
-  ListOption<int64_t> registerTileSizes{
-      *this, "register-tile-sizes",
-      llvm::cl::desc(
-          "Specifies the size of the register tile that will be used "
-          " to vectorize")};
-  Option<bool> registerPromote{
-      *this, "register-promote",
-      llvm::cl::desc(
-          "Promote the register tile into a small aligned memory buffer."),
-      llvm::cl::init(false)};
-  Option<bool> registerPromoteFullTile{
-      *this, "register-promote-full-tile-pad",
-      llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."),
-      llvm::cl::init(false)};
-  Option<bool> pad{*this, "pad", llvm::cl::desc("Pad the operands."),
-                   llvm::cl::init(false)};
-  ListOption<std::string> paddingValues{
-      *this, "padding-values",
-      llvm::cl::desc("Operand padding values parsed by the attribute parser.")};
-  ListOption<int64_t> paddingDimensions{
-      *this, "padding-dimensions",
-      llvm::cl::desc("Operation iterator dimensions to pad.")};
-  ListOption<int64_t> packPaddings{*this, "pack-paddings",
-                                   llvm::cl::desc("Operand packing flags.")};
-  ListOption<int64_t> hoistPaddings{*this, "hoist-paddings",
-                                    llvm::cl::desc("Operand hoisting depths.")};
-  ListOption<SmallVector<int64_t>> transposePaddings{
-      *this, "transpose-paddings",
-      llvm::cl::desc(
-          "Transpose paddings. Specify a operand dimension interchange "
-          "using the following format:\n"
-          "-transpose-paddings=[1,0,2],[0,1],[0,1]\n"
-          "It defines the interchange [1, 0, 2] for operand one and "
-          "the interchange [0, 1] (no transpose) for the remaining operands."
-          "All interchange vectors have to be permuations matching the "
-          "operand rank.")};
-  Option<bool> generalize{*this, "generalize",
-                          llvm::cl::desc("Generalize named operations."),
-                          llvm::cl::init(false)};
-  ListOption<int64_t> iteratorInterchange{
-      *this, "iterator-interchange",
-      llvm::cl::desc("Specifies the iterator interchange.")};
-  Option<bool> decompose{
-      *this, "decompose",
-      llvm::cl::desc("Decompose convolutions to lower dimensional ones."),
-      llvm::cl::init(false)};
-  Option<bool> vectorize{
-      *this, "vectorize",
-      llvm::cl::desc("Rewrite the linalg op as a vector operation."),
-      llvm::cl::init(false)};
-  Option<bool> vectorizePadding{
-      *this, "vectorize-padding",
-      llvm::cl::desc("Rewrite pad tensor ops as vector operations."),
-      llvm::cl::init(false)};
-  Option<std::string> splitVectorTransfersTo{
-      *this, "split-transfers",
-      llvm::cl::desc(
-          "Split vector transfers between slow (masked) and fast "
-          "(unmasked) variants. Possible options are:\n"
-          "\tnone: keep unsplit vector.transfer and pay the full price\n"
-          "\tmemref.copy: use linalg.fill + memref.copy for the slow path\n"
-          "\tvector-transfers: use extra small unmasked vector.transfer for"
-          " the slow path\n"),
-      llvm::cl::init("none")};
-  Option<std::string> vectorizeContractionTo{
-      *this, "vectorize-contraction-to",
-      llvm::cl::desc("the type of vector op to use for linalg contractions"),
-      llvm::cl::init("outerproduct")};
-  Option<bool> unrollVectorTransfers{
-      *this, "unroll-vector-transfers",
-      llvm::cl::desc("Enable full unrolling of vector.transfer operations"),
-      llvm::cl::init(false)};
-  Option<bool> runEnablePass{
-      *this, "run-enable-pass",
-      llvm::cl::desc("Run the enable pass between transformations"),
-      llvm::cl::init(true)};
-  Option<std::string> anchorOpName{
-      *this, "anchor-op",
-      llvm::cl::desc(
-          "Which single linalg op is the anchor for the codegen strategy to "
-          "latch on:\n"
-          "\tlinalg.matmul: anchor on linalg.matmul\n"
-          "\tlinalg.matmul_column_major: anchor on linalg.matmul_column_major\n"
-          "\tmemref.copy: anchor on memref.copy\n"
-          "\tlinalg.fill: anchor on linalg.fill\n"),
-      llvm::cl::init("")};
-  Option<std::string> anchorFuncOpName{
-      *this, "anchor-func",
-      llvm::cl::desc(
-          "Which single func op is the anchor for the codegen strategy to "
-          "latch on."),
-      llvm::cl::init("")};
-};
-
-void TestLinalgCodegenStrategy::runStrategy(
-    const LinalgTilingAndFusionOptions &tilingAndFusionOptions,
-    const LinalgTilingOptions &tilingOptions,
-    const LinalgTilingOptions &registerTilingOptions,
-    LinalgPaddingOptions paddingOptions,
-    vector::VectorContractLowering vectorContractLowering,
-    vector::VectorTransferSplit vectorTransferSplit) {
-  std::string anchorOpNameOrWildcard = fuse ? "" : anchorOpName.getValue();
-  CodegenStrategy strategy;
-  strategy
-      .tileAndFuseIf(fuse && !tileSizes.empty(), anchorOpName,
-                     tilingAndFusionOptions)
-      .tileIf(!fuse && !tileSizes.empty(), anchorOpName, tilingOptions)
-      .promoteIf(!fuse && promote, anchorOpName,
-                 LinalgPromotionOptions()
-                     .setAlignment(16)
-                     .setUseFullTileBuffersByDefault(promoteFullTile))
-      .tileIf(!fuse && !registerTileSizes.empty(), anchorOpName,
-              registerTilingOptions)
-      .promoteIf(!fuse && registerPromote, anchorOpName,
-                 LinalgPromotionOptions()
-                     .setAlignment(16)
-                     .setUseFullTileBuffersByDefault(registerPromoteFullTile))
-      .padIf(pad, anchorOpNameOrWildcard, std::move(paddingOptions))
-      .decomposeIf(decompose)
-      .generalizeIf(generalize, anchorOpNameOrWildcard)
-      .interchangeIf(!iteratorInterchange.empty(), iteratorInterchange)
-      .vectorizeIf(vectorize, anchorOpNameOrWildcard, nullptr, vectorizePadding)
-      .vectorLowering(
-          LinalgVectorLoweringOptions()
-              .setVectorTransformsOptions(
-                  vector::VectorTransformsOptions()
-                      .setVectorTransformsOptions(vectorContractLowering)
-                      .setVectorTransferSplit(vectorTransferSplit))
-              .setVectorTransferToSCFOptions(
-                  VectorTransferToSCFOptions().enableFullUnroll(
-                      unrollVectorTransfers))
-              .enableTransferPartialRewrite()
-              .enableContractionLowering()
-              .enableTransferToSCFConversion());
-  // Create a nested OpPassManager and run it.
-  func::FuncOp funcOp = getOperation();
-  OpPassManager dynamicPM("func.func");
-  strategy.configurePassPipeline(dynamicPM, funcOp.getContext(), runEnablePass);
-  if (failed(runPipeline(dynamicPM, funcOp)))
-    return signalPassFailure();
-}
-} // namespace
-
-/// Apply transformations specified as patterns.
-void TestLinalgCodegenStrategy::runOnOperation() {
-  if (!anchorFuncOpName.empty() && anchorFuncOpName != getOperation().getName())
-    return;
-
-  LinalgTilingAndFusionOptions tilingAndFusionOptions;
-  tilingAndFusionOptions.tileSizes = {tileSizes.begin(), tileSizes.end()};
-  tilingAndFusionOptions.tileInterchange = {tileInterchange.begin(),
-                                            tileInterchange.end()};
-
-  LinalgTilingOptions tilingOptions;
-  if (!tileSizes.empty())
-    tilingOptions = tilingOptions.setTileSizes(tileSizes);
-  if (!tileInterchange.empty())
-    tilingOptions = tilingOptions.setInterchange(
-        SmallVector<unsigned>(tileInterchange.begin(), tileInterchange.end()));
-
-  LinalgTilingOptions registerTilingOptions;
-  if (!registerTileSizes.empty())
-    registerTilingOptions =
-        registerTilingOptions.setTileSizes(registerTileSizes);
-
-  // Parse the padding values.
-  SmallVector<Attribute> paddingValueAttributes;
-  for (const std::string &paddingValue : paddingValues) {
-    paddingValueAttributes.push_back(
-        parseAttribute(paddingValue, &getContext()));
-  }
-
-  // Assemble the padding options (values, dimensions, pack/hoist flags, transposes).
-  LinalgPaddingOptions paddingOptions;
-  paddingOptions.setPaddingValues(paddingValueAttributes);
-  paddingOptions.setPaddingDimensions(
-      SmallVector<int64_t>{paddingDimensions.begin(), paddingDimensions.end()});
-  paddingOptions.setPackPaddings(
-      SmallVector<bool>{packPaddings.begin(), packPaddings.end()});
-  paddingOptions.setHoistPaddings(
-      SmallVector<int64_t>{hoistPaddings.begin(), hoistPaddings.end()});
-  paddingOptions.setTransposePaddings(transposePaddings);
-
-  vector::VectorContractLowering vectorContractLowering =
-      llvm::StringSwitch<vector::VectorContractLowering>(
-          vectorizeContractionTo.getValue())
-          .Case("matrixintrinsics", vector::VectorContractLowering::Matmul)
-          .Case("dot", vector::VectorContractLowering::Dot)
-          .Case("outerproduct", vector::VectorContractLowering::OuterProduct)
-          .Default(vector::VectorContractLowering::OuterProduct);
-  vector::VectorTransferSplit vectorTransferSplit =
-      llvm::StringSwitch<vector::VectorTransferSplit>(
-          splitVectorTransfersTo.getValue())
-          .Case("none", vector::VectorTransferSplit::None)
-          .Case("memref-copy", vector::VectorTransferSplit::LinalgCopy)
-          .Case("vector-transfers", vector::VectorTransferSplit::VectorTransfer)
-          .Default(vector::VectorTransferSplit::None);
-
-  runStrategy(tilingAndFusionOptions, tilingOptions, registerTilingOptions,
-              paddingOptions, vectorContractLowering, vectorTransferSplit);
-}
-
-namespace mlir {
-namespace test {
-void registerTestLinalgCodegenStrategy() {
-  PassRegistration<TestLinalgCodegenStrategy>();
-}
-} // namespace test
-} // namespace mlir

diff  --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index c9b1b492eefaf..778c569c5ce16 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -86,7 +86,6 @@ void registerTestGenericIRVisitorsPass();
 void registerTestGenericIRVisitorsInterruptPass();
 void registerTestInterfaces();
 void registerTestLastModifiedPass();
-void registerTestLinalgCodegenStrategy();
 void registerTestLinalgElementwiseFusion();
 void registerTestLinalgFusionTransforms();
 void registerTestLinalgTensorFusionTransforms();
@@ -185,7 +184,6 @@ void registerTestPasses() {
   mlir::test::registerTestGenericIRVisitorsPass();
   mlir::test::registerTestInterfaces();
   mlir::test::registerTestLastModifiedPass();
-  mlir::test::registerTestLinalgCodegenStrategy();
   mlir::test::registerTestLinalgElementwiseFusion();
   mlir::test::registerTestLinalgFusionTransforms();
   mlir::test::registerTestLinalgTensorFusionTransforms();


        


More information about the Mlir-commits mailing list