[Mlir-commits] [mlir] cd6e02e - [mlir][Linalg] Retire TestLinalgCodegenStrategy pass.
Nicolas Vasilache
llvmlistbot at llvm.org
Wed Jul 13 04:20:49 PDT 2022
Author: Nicolas Vasilache
Date: 2022-07-13T04:20:42-07:00
New Revision: cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9
URL: https://github.com/llvm/llvm-project/commit/cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9
DIFF: https://github.com/llvm/llvm-project/commit/cd6e02eebcfbad13c6e075d620310ce2fb5b5ea9.diff
LOG: [mlir][Linalg] Retire TestLinalgCodegenStrategy pass.
This pass tests patterns that are already tested elsewhere by applying them in a semi-targeted
fashion using anchor function and op names.
From now on, targeted tests should use the transform dialect interpreter.
Differential Revision: https://reviews.llvm.org/D129627
Added:
Modified:
mlir/test/lib/Dialect/Linalg/CMakeLists.txt
mlir/tools/mlir-opt/mlir-opt.cpp
Removed:
mlir/test/Dialect/Linalg/codegen-strategy.mlir
mlir/test/Dialect/Linalg/decompose-convolution.mlir
mlir/test/Dialect/Linalg/hoist-padding.mlir
mlir/test/Dialect/Linalg/interchange.mlir
mlir/test/Dialect/Linalg/pad.mlir
mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
################################################################################
diff --git a/mlir/test/Dialect/Linalg/codegen-strategy.mlir b/mlir/test/Dialect/Linalg/codegen-strategy.mlir
deleted file mode 100644
index 05f99635e85c6..0000000000000
--- a/mlir/test/Dialect/Linalg/codegen-strategy.mlir
+++ /dev/null
@@ -1,92 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=2,4,8 vectorize vectorize-contraction-to=matrixintrinsics unroll-vector-transfers=true" -split-input-file | FileCheck %s --check-prefix=CHECK-INTRINSIC
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 promote promote-full-tile-pad register-tile-sizes=2,4,8 vectorize vectorize-contraction-to=outerproduct split-transfers=true unroll-vector-transfers=false" -split-input-file | FileCheck %s --check-prefix=CHECK-OUTER
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 tile-interchange=1,2,0 generalize iterator-interchange=0,2,1" -split-input-file | FileCheck %s --check-prefix=CHECK-INTERCHANGE
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=1,1,0 hoist-paddings=3,3,0" -split-input-file | FileCheck %s --check-prefix=CHECK-PAD
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 vectorize" -split-input-file | FileCheck %s --check-prefix=CHECK-FUSE
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=conv anchor-op=linalg.conv_2d_nhwc_hwcf tile-sizes=1,1,8,32,1,1,8 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 decompose vectorize vectorize-padding" -split-input-file | FileCheck %s --check-prefix=CHECK-DECOMP
-
-// CHECK-INTRINSIC: func @matmul(
-// CHECK-OUTER: func @matmul(
-func.func @matmul(%arg0: memref<72x72xf32>, %arg1: memref<72x72xf32>, %arg2: memref<72x72xf32>) {
-
- // Check the matrix intrinsic lowering is triggered.
- // CHECK-INTRINSIC: vector.matrix_multiply
- // CHECK-INTRINSIC-SAME: {lhs_columns = 8 : i32, lhs_rows = 2 : i32, rhs_columns = 4 : i32}
- // CHECK-INTRINSIC-SAME: (vector<16xf32>, vector<32xf32>) -> vector<8xf32>
-
- // Check the outer product lowering is triggered.
- // CHECK-OUTER: vector.outerproduct {{.*}} : vector<2xf32>, vector<4xf32>
- linalg.matmul ins(%arg0, %arg1: memref<72x72xf32>, memref<72x72xf32>) outs(%arg2: memref<72x72xf32>)
- func.return
-}
-
-// -----
-
-// CHECK-INTERCHANGE: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
- // CHECK-INTERCHANGE-DAG: %[[C16:.*]] = arith.constant 16
- // CHECK-INTERCHANGE-DAG: %[[C32:.*]] = arith.constant 32
- // CHECK-INTERCHANGE-DAG: %[[C64:.*]] = arith.constant 64
-
- // Check the tile loops are interchanged.
- // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C32]]
- // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C64]]
- // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C16]]
-
- // Check the operation has been generalized and interchanged.
- // CHECK-INTERCHANGE: linalg.generic
- // CHECK-INTERCHANGE-SAME: iterator_types = ["parallel", "reduction", "parallel"]
- %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32>
- func.return %0 : tensor<72x72xf32>
-}
-
-// -----
-
-// CHECK-PAD-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 72, 16)>
-
-// CHECK-PAD: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
-
- // Check the padding of the input operands has been hoisted out of the tile loop nest.
- // CHECK-PAD-COUNT=2: tensor.pad %{{.*}} nofold
- // CHECK-PAD: scf.for
- // Check CSE eliminates the duplicate min operations introduced by tiling.
- // CHECK-PAD: affine.min #[[MAP0]]
- // CHECK-PAD-NOT: affine.min #[[MAP0]]
- // CHECK-PAD-COUNT=2: scf.for
- // CHECK-PAD: linalg.matmul
- %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32>
- func.return %0 : tensor<72x72xf32>
-}
-
-// -----
-
-// CHECK-FUSE: func @matmul(
-func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> {
-
- // Check the padding and vectorization applies to the fill operation due to the empty anchor op string.
- // CHECK-FUSE: %[[CST:.*]] = arith.constant dense<0.000000e+00>
- // CHECK-FUSE: vector.transfer_write %[[CST]]
- %cst = arith.constant 0.0 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<72x72xf32>) -> tensor<72x72xf32>
-
- // Check the matmul is padded and vectorized despite the empty anchor op string.
- // CHECK-FUSE: vector.outerproduct
- %1 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%0: tensor<72x72xf32>) -> tensor<72x72xf32>
- func.return %1 : tensor<72x72xf32>
-}
-
-// -----
-
-// CHECK-DECOMP: func @conv(
-func.func @conv(%arg0: tensor<8x18x17x32xf32>, %arg1: tensor<3x3x32x64xf32>, %arg2: tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32>
-
- // Check the conv is padded by a rank-reducing vector transfer op pair.
- // CHECK-DECOMP: vector.transfer_read {{.*}}: tensor<1x1x?x8xf32>, vector<1x8x8xf32>
- // CHECK-DECOMP: vector.outerproduct
- // CHECK-DECOMP: vector.transfer_write {{.*}}: vector<1x8x32xf32>, tensor<1x1x?x32xf32>
- %1 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x17x32xf32>, tensor<3x3x32x64xf32>) outs(%0 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32>
- func.return %1 : tensor<8x16x15x64xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/decompose-convolution.mlir b/mlir/test/Dialect/Linalg/decompose-convolution.mlir
deleted file mode 100644
index ad900a568c709..0000000000000
--- a/mlir/test/Dialect/Linalg/decompose-convolution.mlir
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: mlir-opt -test-linalg-codegen-strategy="decompose" -split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor
-// CHECK-SAME: (%[[INPUT:.+]]: tensor<4x1x6x3xf32>, %[[FILTER:.+]]: tensor<1x2x3x8xf32>, %[[INIT:.+]]: tensor<4x1x2x8xf32>)
-func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x1x6x3xf32>, %filter: tensor<1x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> {
- %0 = linalg.conv_2d_nhwc_hwcf
- {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>}
- ins(%input, %filter : tensor<4x1x6x3xf32>, tensor<1x2x3x8xf32>)
- outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32>
- return %0 : tensor<4x1x2x8xf32>
-}
-
-// CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]]
-// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 6, 3] [1, 1, 1, 1] : tensor<4x1x6x3xf32> to tensor<4x6x3xf32>
-// CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]]
-// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [1, 2, 3, 8] [1, 1, 1, 1] : tensor<1x2x3x8xf32> to tensor<2x3x8xf32>
-// CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]]
-// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x1x2x8xf32> to tensor<4x2x8xf32>
-// CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf
-// CHECK-SAME: dilations = dense<3> : vector<1xi64>
-// CHECK-SAME: strides = dense<2> : vector<1xi64>
-// CHECK-SAME: ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor<4x6x3xf32>, tensor<2x3x8xf32>)
-// CHECK-SAME: outs(%[[INIT_1D]] : tensor<4x2x8xf32>)
-// CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]]
-// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x2x8xf32> into tensor<4x1x2x8xf32>
-// CHECK: return %[[CONV_2D]]
-// -----
-
-// CHECK-LABEL: func @conv2d_nhwc_qxqx1xq_tensor
-// CHECK-SAME: (%[[INPUT:.+]]: tensor<?x?x1x?xf32>, %[[FILTER:.+]]: tensor<?x1x?x?xf32>, %[[INIT:.+]]: tensor<?x?x1x?xf32>)
-func.func @conv2d_nhwc_qxqx1xq_tensor(%input: tensor<?x?x1x?xf32>, %filter: tensor<?x1x?x?xf32>, %init: tensor<?x?x1x?xf32>) -> tensor<?x?x1x?xf32> {
- %0 = linalg.conv_2d_nhwc_hwcf
- {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>}
- ins(%input, %filter : tensor<?x?x1x?xf32>, tensor<?x1x?x?xf32>)
- outs(%init : tensor<?x?x1x?xf32>) -> tensor<?x?x1x?xf32>
- return %0 : tensor<?x?x1x?xf32>
-}
-
-// CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]]
-// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-// CHECK-SAME: tensor<?x?x1x?xf32> to tensor<?x?x?xf32>
-// CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]]
-// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] :
-// CHECK-SAME: tensor<?x1x?x?xf32> to tensor<?x?x?xf32>
-// CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]]
-// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-// CHECK-SAME: tensor<?x?x1x?xf32> to tensor<?x?x?xf32>
-// CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf
-// CHECK-SAME: dilations = dense<2> : vector<1xi64>
-// CHECK-SAME: strides = dense<3> : vector<1xi64>
-// CHECK-SAME: ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
-// CHECK-SAME: outs(%[[INIT_1D]] : tensor<?x?x?xf32>)
-// CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]]
-// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] :
-// CHECK-SAME: tensor<?x?x?xf32> into tensor<?x?x1x?xf32>
-// CHECK: return %[[CONV_2D]]
-
-// -----
-
-// Do not convert convolution ops whose window dimensions are not ones.
-
-// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor
-func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x3x5x3xf32>, %filter: tensor<2x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> {
- // CHECK: linalg.conv_2d_nhwc_hwcf
- %0 = linalg.conv_2d_nhwc_hwcf
- {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%input, %filter : tensor<4x3x5x3xf32>, tensor<2x2x3x8xf32>)
- outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32>
- return %0 : tensor<4x1x2x8xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor
-func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x1x113x96xf32>, %filter: tensor<1x3x96xf32>, %out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32> {
- // CHECK: linalg.depthwise_conv_1d_nwc_wc
- %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
- ins(%input, %filter: tensor<1x1x113x96xf32>, tensor<1x3x96xf32>)
- outs(%out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32>
- return %0: tensor<1x1x56x96xf32>
-}
-
-// -----
-
-// Do not convert convolution ops whose window dimensions are not ones.
-
-// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor
-func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>, %out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32> {
- // CHECK: linalg.depthwise_conv_2d_nhwc_hwc
- %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
- ins(%input, %filter: tensor<1x113x113x96xf32>, tensor<3x3x96xf32>)
- outs(%out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
- return %0: tensor<1x56x56x96xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
deleted file mode 100644
index 5ac26232d9c0e..0000000000000
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ /dev/null
@@ -1,480 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATVEC
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 transpose-paddings=[1,0],[0],[0] run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=TRANSP
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad hoist-paddings=1,2,1 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
-
-// MATVEC-DAG: #[[DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-
-// MATVEC: static_size_divisible
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @static_size_divisible(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c4 = arith.constant 4 : index
-
- // Pack the vector tiles for all values of IV (IVx4).
- // MATVEC: = linalg.init_tensor [3, 4]
- // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
- // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DIV4]](%[[PIV0]])
- // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [4]
- // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]
- // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Index the packed vector.
- // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DIV4]](%[[IV0]])
- // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
- %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = tensor.pad %2 nofold low[%c0] high[%c0] {
- ^bb0(%arg5: index):
- tensor.yield %cst : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the packed input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]]
- %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %4 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 12, 5)>
-// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 5)>
-// MATVEC-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)>
-#map0 = affine_map<(d0) -> (5, -d0 + 12)>
-#map1 = affine_map<(d0) -> (-d0 + 5)>
-
-// MATVEC: static_size_not_divisible
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @static_size_not_divisible(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c5 = arith.constant 5 : index
-
- // Pack the vector tiles for all values of IV (IVx5).
- // MATVEC: = linalg.init_tensor [3, 5]
- // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
- // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]])
- // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])
- // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]]
- // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]])
- // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]]
- // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %1 = affine.min #map0(%arg3)
- %2 = tensor.extract_slice %arg0[0, %arg3] [24, %1] [1, 1] : tensor<24x12xf32> to tensor<24x?xf32>
-
- // Index the packed vector.
- // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV0]])
- // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
- %3 = tensor.extract_slice %arg1[%arg3] [%1] [1] : tensor<12xf32> to tensor<?xf32>
- %4 = affine.apply #map1(%1)
- %5 = tensor.pad %2 low[%c0, %c0] high[%c0, %4] {
- ^bb0(%arg5: index, %arg6: index):
- tensor.yield %cst : f32
- } : tensor<24x?xf32> to tensor<24x5xf32>
- %6 = tensor.pad %3 low[%c0] high[%4] {
- ^bb0(%arg5: index):
- tensor.yield %cst : f32
- } : tensor<?xf32> to tensor<5xf32>
-
- // Check matvec uses the packed input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]]
- %7 = linalg.matvec ins(%5, %6 : tensor<24x5xf32>, tensor<5xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %7 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC-DAG: #[[SDIV4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-// MATVEC-DAG: #[[DDIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
-// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 4)>
-#map0 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
-#map1 = affine_map<(d0) -> (-d0 + 4)>
-
-// MATVEC: dynamic_size
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?xf32>
-func.func @dynamic_size(%arg0: tensor<24x?xf32>,
- %arg1: tensor<?xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c4 = arith.constant 4 : index
-
- // MATVEC: %[[D0:.*]] = tensor.dim
- %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32>
-
- // Pack the vector tiles for all values of IV (IVx4).
- // MATVEC: %[[PS0:.*]] = affine.apply #[[SDIV4]]()[%[[D0]]]
- // MATVEC: = linalg.init_tensor [%[[PS0]], 4]
- // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
- // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DDIV4]](%[[PIV0]])
- // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])[%[[D0]]]
- // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]]
- // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]])
- // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]]
- // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]]
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %2 = affine.min #map0(%arg3)[%0]
- %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32>
-
- // Index the packed vector.
- // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DDIV4]](%[[IV0]])
- // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]]
- %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor<?xf32> to tensor<?xf32>
- %5 = affine.apply #map1(%2)
- %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5] {
- ^bb0(%arg5: index, %arg6: index):
- tensor.yield %cst : f32
- } : tensor<24x?xf32> to tensor<24x4xf32>
- %7 = tensor.pad %4 nofold low[%c0] high[%5] {
- ^bb0(%arg5: index):
- tensor.yield %cst : f32
- } : tensor<?xf32> to tensor<4xf32>
-
- // Check matvec uses the packed input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]]
- %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %8 : tensor<24xf32>
- }
- return %1 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC: non_constant_padding
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @non_constant_padding(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %c4 = arith.constant 4 : index
- %c12 = arith.constant 12 : index
- %c0 = arith.constant 0 : index
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Check the non constant padding is not hoisted.
- // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
- // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]
- %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = tensor.pad %2 nofold low[%c0] high[%c0] {
- ^bb0(%arg5: index):
- %5 = arith.index_cast %arg3 : index to i32
- %6 = arith.sitofp %5 : i32 to f32
- tensor.yield %6 : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the padded input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]]
- %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %4 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC: non_constant_op_padding
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-func.func @non_constant_op_padding(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c4 = arith.constant 4 : index
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Check the non constant op padding is not hoisted.
- // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
- // MATVEC: %[[V0:.*]] = tensor.extract %[[ARG1]][%[[IV0]]
- // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]
- // MATVEC: tensor.yield %[[V0]]
- %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = tensor.extract %arg1[%arg3] : tensor<12xf32>
- %4 = tensor.pad %2 nofold low[%c0] high[%c0] {
- ^bb0(%arg5: index):
- tensor.yield %3 : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the padded input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]]
- %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %5 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC: non_index_operand
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: i32
-func.func @non_index_operand(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>,
- %arg3: i32) -> tensor<24xf32> {
- %c4 = arith.constant 4 : index
- %c12 = arith.constant 12 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Check the index_cast prevents hoisting due to its non index operand.
- // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
- // MATVEC: %[[IDX0:.*]] = arith.index_cast %[[ARG3]]
- // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
- %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = arith.index_cast %arg3 : i32 to index
- %4 = tensor.pad %2 nofold low[%3] high[%3] {
- ^bb0(%arg6: index):
- tensor.yield %cst : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the padded input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]]
- %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %5 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC: memory_effect
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: memref<?xindex>
-func.func @memory_effect(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>,
- %arg3: memref<?xindex>) -> tensor<24xf32> {
- %c4 = arith.constant 4 : index
- %c12 = arith.constant 12 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Check the load prevents hoisting due to its memory effect.
- // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
- // MATVEC: %[[IDX0:.*]] = memref.load %[[ARG3]]
- // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
- %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = memref.load %arg3[%c0] : memref<?xindex>
- %4 = tensor.pad %2 nofold low[%3] high[%3] {
- ^bb0(%arg6: index):
- tensor.yield %cst : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the padded input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]]
- %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %5 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-// MATVEC: index_result_loop
-// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32>
-// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: index
-func.func @index_result_loop(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12xf32>,
- %arg2: tensor<24xf32>,
- %arg3: index) -> tensor<24xf32> {
- %c4 = arith.constant 4 : index
- %c12 = arith.constant 12 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
-
- // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) {
- %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
-
- // Check the unexpected operation with a region prevents hoisting.
- // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]]
- // MATVEC: %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]]
- // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]]
- %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32>
- %3 = scf.for %arg6 = %c0 to %c12 step %arg3 iter_args(%arg7 = %c0) -> (index) {
- %6 = arith.addi %arg3, %arg7 : index
- scf.yield %6 : index
- }
- %4 = tensor.pad %2 nofold low[%3] high[%3] {
- ^bb0(%arg6: index):
- tensor.yield %cst : f32
- } : tensor<4xf32> to tensor<4xf32>
-
- // Check matvec uses the padded input vector.
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]]
- %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %5 : tensor<24xf32>
- }
- return %0 : tensor<24xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0) -> (-d0 + 12, 5)>
-#map1 = affine_map<(d0) -> (-d0 + 5)>
-
-// MATMUL: tile_and_fuse
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<12x6xf32>
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<6x24xf32>
-func.func @tile_and_fuse(%arg0: tensor<12x6xf32>,
- %arg1: tensor<6x24xf32>,
- %arg2: tensor<12x24xf32>) -> tensor<12x24xf32> {
- %c6 = arith.constant 6 : index
- %c3 = arith.constant 3 : index
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c5 = arith.constant 5 : index
- %cst = arith.constant 0.000000e+00 : f32
-
- // Check the second input operand is hoisted by two loop nests.
- // MATMUL: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // MATMUL: %[[T2:.*]] = tensor.pad %[[T1]]
-
- // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<12x24xf32>) {
- %1 = affine.min #map0(%arg3)
-
- // Check the extract_slice op introduced by the double tiling does not prevent the hoisting.
- %2 = tensor.extract_slice %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor<12x24xf32> to tensor<?x24xf32>
- %3 = affine.apply #map1(%1)
-
- // Check the fused and padded fill op does not prevent hoisting.
- %4 = tensor.pad %2 nofold low[%c0, %c0] high[%3, %c0] {
- ^bb0(%arg5: index, %arg6: index):
- tensor.yield %cst : f32
- } : tensor<?x24xf32> to tensor<5x24xf32>
- %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<5x24xf32>) -> tensor<5x24xf32>
- %6 = tensor.extract_slice %5[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor<?x24xf32>
-
- // Check the first input operand is hoisted by one loop nest.
- // MATMUL: %[[T3:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
- // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL: %[[T5:.*]] = tensor.pad %[[T4]]
-
- // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
- %7 = scf.for %arg5 = %c0 to %c6 step %c3 iter_args(%arg6 = %6) -> (tensor<?x24xf32>) {
-
- // Index the packed operands.
- // MATMUL-DAG: %[[T6:.*]] = tensor.extract_slice %[[T3]]
- // MATMUL-DAG: %[[T7:.*]] = tensor.extract_slice %[[T0]]
- %9 = tensor.extract_slice %arg0[%arg3, %arg5] [%1, 3] [1, 1] : tensor<12x6xf32> to tensor<?x3xf32>
- %10 = tensor.extract_slice %arg1[%arg5, 0] [3, 24] [1, 1] : tensor<6x24xf32> to tensor<3x24xf32>
- %11 = tensor.extract_slice %arg6[0, 0] [%1, 24] [1, 1] : tensor<?x24xf32> to tensor<?x24xf32>
- %12 = tensor.pad %9 nofold low[%c0, %c0] high[%3, %c0] {
- ^bb0(%arg7: index, %arg8: index):
- tensor.yield %cst : f32
- } : tensor<?x3xf32> to tensor<5x3xf32>
- %13 = tensor.pad %10 nofold low[%c0, %c0] high[%c0, %c0] {
- ^bb0(%arg7: index, %arg8: index):
- tensor.yield %cst : f32
- } : tensor<3x24xf32> to tensor<3x24xf32>
-
- // Check the output padding is not hoisted.
- // MATMUL: %[[T8:.*]] = tensor.pad
- %14 = tensor.pad %11 nofold low[%c0, %c0] high[%3, %c0] {
- ^bb0(%arg7: index, %arg8: index):
- tensor.yield %cst : f32
- } : tensor<?x24xf32> to tensor<5x24xf32>
-
- // Check matmul uses the padded operands.
- // MATMUL: = linalg.matmul ins(%[[T6]], %[[T7]] {{.*}} outs(%[[T8]]
- %15 = linalg.matmul ins(%12, %13 : tensor<5x3xf32>, tensor<3x24xf32>) outs(%14 : tensor<5x24xf32>) -> tensor<5x24xf32>
- %16 = tensor.extract_slice %15[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor<?x24xf32>
- %17 = tensor.insert_slice %16 into %arg6[0, 0] [%1, 24] [1, 1] : tensor<?x24xf32> into tensor<?x24xf32>
- scf.yield %17 : tensor<?x24xf32>
- }
- %8 = tensor.insert_slice %7 into %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor<?x24xf32> into tensor<12x24xf32>
- scf.yield %8 : tensor<12x24xf32>
- }
- return %0 : tensor<12x24xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
-#map1 = affine_map<(d0) -> (-d0 + 4)>
-
-// TRANSP: transpose
-// TRANSP-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x?xf32>
-func.func @transpose(%arg0: tensor<24x?xf32>,
- %arg1: tensor<?xf32>,
- %arg2: tensor<24xf32>) -> tensor<24xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c4 = arith.constant 4 : index
- %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32>
-
- // Transpose the padded matrix.
- // TRANSP: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = {{.*}}iter_args(%[[T1:.*]] =
- // TRANSP: %[[T2:.*]] = tensor.pad
- // TRANSP: %[[T3:.*]] = tensor.extract_slice %[[T1]]
- // TRANSP: %[[T4:.*]] = linalg.generic
- // TRANSP-SAME: ins(%[[T2]] : tensor<24x4xf32>
- // TRANSP-SAME: outs(%[[T3]] : tensor<4x24xf32>
- // TRANSP: %[[T5:.*]] = tensor.insert_slice %[[T4]] into %[[T1]]
- // TRANSP: scf.yield %[[T5:.*]]
-
- // TRANSP: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) {
- %2 = affine.min #map0(%arg3)[%0]
- %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32>
-
- // Index the packed vector and transpose back.
- // TRANSP: %[[T6:.*]] = tensor.extract_slice %[[T0]]
- // TRANSP: %[[T7:.*]] = linalg.init_tensor
- // TRANSP: %[[T8:.*]] = linalg.generic
- // TRANSP-SAME: ins(%[[T6]] : tensor<4x24xf32>
- // TRANSP-SAME: outs(%[[T7]] : tensor<24x4xf32>
- %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor<?xf32> to tensor<?xf32>
- %5 = affine.apply #map1(%2)
- %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5] {
- ^bb0(%arg5: index, %arg6: index): // no predecessors
- tensor.yield %cst : f32
- } : tensor<24x?xf32> to tensor<24x4xf32>
- %7 = tensor.pad %4 nofold low[%c0] high[%5] {
- ^bb0(%arg5: index): // no predecessors
- tensor.yield %cst : f32
- } : tensor<?xf32> to tensor<4xf32>
-
- // Check matvec uses the packed input vector.
- // TRANSP: = linalg.matvec ins(%[[T8]]
- %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32>
- scf.yield %8 : tensor<24xf32>
- }
- return %1 : tensor<24xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/interchange.mlir b/mlir/test/Dialect/Linalg/interchange.mlir
deleted file mode 100644
index 1d422eef242b9..0000000000000
--- a/mlir/test/Dialect/Linalg/interchange.mlir
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" -test-linalg-codegen-strategy="iterator-interchange=1,3,4,2,0" | FileCheck --check-prefix=CANCEL-OUT %s
-
-#map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
-
-func.func @interchange_generic_op(%arg0 : memref<1x2x3x4x5xindex>, %arg1 : memref<1x2x4xindex>) {
- linalg.generic {
- indexing_maps = [#map0, #map1],
- iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]}
- ins(%arg0 : memref<1x2x3x4x5xindex>)
- outs(%arg1 : memref<1x2x4xindex>) {
- ^bb0(%arg2 : index, %arg3 : index) :
- %0 = linalg.index 0 : index
- %1 = linalg.index 1 : index
- %2 = linalg.index 4 : index
- %3 = arith.subi %0, %1 : index
- %4 = arith.addi %3, %2 : index
- %5 = arith.addi %4, %arg2 : index
- linalg.yield %5 : index
- }
- return
-}
-
-// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4, d2, d0)>
-// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d2)>
-// CHECK: func @interchange_generic_op
-// CHECK: linalg.generic
-// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]]
-// CHECK-SAME: iterator_types = ["reduction", "parallel", "parallel", "parallel", "reduction"]
-// CHECK-DAG: %[[IDX0:.+]] = linalg.index 1 : index
-// CHECK-DAG: %[[IDX1:.+]] = linalg.index 3 : index
-// CHECK-DAG: %[[IDX4:.+]] = linalg.index 0 : index
-// CHECK: %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index
-// CHECK: %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index
-// CHECK: %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index
-
-// CANCEL-OUT-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-// CANCEL-OUT-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
-// CANCEL-OUT: func @interchange_generic_op
-// CANCEL-OUT: linalg.generic
-// CANCEL-OUT-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]]
-// CANCEL-OUT-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]
-// CANCEL-OUT-DAG: %[[IDX0:.+]] = linalg.index 0 : index
-// CANCEL-OUT-DAG: %[[IDX1:.+]] = linalg.index 1 : index
-// CANCEL-OUT-DAG: %[[IDX4:.+]] = linalg.index 4 : index
-// CANCEL-OUT: %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index
-// CANCEL-OUT: %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index
-// CANCEL-OUT: %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index
-
-
diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
deleted file mode 100644
index 0e0e2e1066d6e..0000000000000
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ /dev/null
@@ -1,600 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=1,1,0 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=MATMUL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,1.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,0.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL-MATMUL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32 pack-paddings=1,1,0 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1 pack-paddings=1,1,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=PARTIAL
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.depthwise_conv_2d_nhwc_hwc pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=1,2 pack-paddings=1,0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=DEPTHWISE_CONV_2D
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 12, 7)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (-s0 + 12, 7)>
-
-// MATMUL: static_sizes_output_divisible
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- // MATMUL-DAG: %[[CST:.*]] = arith.constant 0.
- // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
-
- // MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
- %0 = affine.min #map()[%iv2]
-
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
- // Check statically sized matmul inputs with partially divisible sizes are padded.
- // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
- // MATMUL: %[[T3:.*]] = tensor.pad %[[T0]] nofold
- // MATMUL-SAME: [%[[C0]], %[[C0]]]
- // MATMUL-SAME: [%[[C0]], %[[V0]]
- // MATMUL: tensor.yield %[[CST]]
- // MATMUL: %[[T4:.*]] = tensor.pad %[[T1]] nofold
-
- // Check the statically sized matmul output with fully divisible sizes is not padded.
- // MATMUL: %[[T5:.*]] = linalg.matmul
- // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
- // MATMUL-SAME: outs(%[[T2]] : tensor<4x5xf32>)
- // MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]]
- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
- func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 25, 7)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
-#map = affine_map<()[s0] -> (-s0 + 25, 7)>
-
-// MATMUL: static_sizes_input_divisible
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- // MATMUL-DAG: %[[CST:.*]] = arith.constant 0.
- // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
-
- %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
-
- // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
- %4 = affine.min #map()[%iv1]
- %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
-
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
- %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
- // Check the statically sized matmul output with partially divisible sizes is padded.
- // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
- // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] low
- // MATMUL-SAME: [%[[C0]], %[[C0]]]
- // MATMUL-SAME: [%[[C0]], %[[V0]]
- // MATMUL: tensor.yield %[[CST]]
-
- // MATMUL: %[[T2:.*]] = linalg.matmul
- // MATMUL-SAME: outs(%[[T1]] : tensor<4x7xf32>)
- // MATMUL: %[[T3:.*]] = tensor.extract_slice %[[T2]]
- // MATMUL: %[[T4:.*]] = tensor.insert_slice %[[T3]]
- %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
- %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-
- // MATMUL: return %[[T4]]
- func.return %8 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 5)>
-// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 7)>
-// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 6)>
-// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
-// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>
-
-#map0 = affine_map<()[s0, s1] -> (-s0 + s1, 5)>
-#map1 = affine_map<()[s0, s1] -> (-s0 + s1, 6)>
-#map2 = affine_map<()[s0, s1] -> (-s0 + s1, 7)>
-
-// MATMUL: dynamic_sizes
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
-// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
-func.func @dynamic_sizes(%arg0: tensor<?x?xf32>,
- %arg1: tensor<?x?xf32>,
- %arg2: tensor<?x?xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
- // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
- // MATMUL-DAG: %[[C1:.*]] = arith.constant 1
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
-
- // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
- // MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
- // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
- %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
- %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
- %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
-
- // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
- // MATMUL: %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
- // MATMUL: %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
- %6 = affine.min #map0()[%iv0, %0]
- %7 = affine.min #map1()[%iv2, %1]
- %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %9 = affine.min #map2()[%iv1, %2]
- %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-
- // Check all matmul operands are padded.
- // MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
- // MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
- // MATMUL: %[[T3:.*]] = tensor.pad %{{.*}} nofold
- // MATMUL-SAME: [%[[C0]], %[[C0]]]
- // MATMUL-SAME: [%[[V0]], %[[V1]]
- // MATMUL: %[[T4:.*]] = tensor.pad %{{.*}} nofold
- // MATMUL: %[[T5:.*]] = tensor.pad %{{.*}} low
-
- // Check the dynamic matmul has been erased.
- // MATMUL-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>
-
- // Check all padded matmul operands are statically sized.
- // MATMUL: %[[T6:.*]] = linalg.matmul
- // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
- // MATMUL-SAME: outs(%[[T5]] : tensor<5x7xf32>)
- // MATMUL: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
- // MATMUL: %[[T8:.*]] = tensor.insert_slice %[[T7]]
- %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
- %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
- // MATMUL: return %[[T8]]
- func.return %13 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// FILL-MATMUL: pad_multiple
-// FILL-MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @pad_multiple(%arg0: tensor<64x64xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
-
- // FILL-MATMUL: %[[T0:.*]] = tensor.extract_slice
- %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Check the two operations are padded by the same pad tensor operation.
- // FILL-MATMUL: %[[T1:.*]] = tensor.pad %[[T0]]
- // FILL-MATMUL: %[[T2:.*]] = linalg.fill {{.*}} outs(%[[T1]]
- // FILL-MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
- // FILL-MATMUL: = tensor.extract_slice %[[T3]]
- %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
- %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %2 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: pad_chain
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @pad_chain(%arg0: tensor<64x64xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Check the matmul at the end of the use-def chain is padded.
- // MATMUL: %[[T0:.*]] = linalg.fill
- // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]]
- // MATMUL: %[[T2:.*]] = linalg.matmul {{.*}} outs(%[[T1]]
- // MATMUL: = tensor.extract_slice %[[T2]]
- %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
- %2 = linalg.matmul ins(%0, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %2 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: compose_padding
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
-func.func @compose_padding(%arg0: tensor<64x64xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
-
- // MATMUL: %[[SIZE:.*]] = affine.min
- %size = affine.min #map0()[%iv0]
-
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL-SAME: [0, 0]
- // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
- // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]]
- // MATMUL: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
- // MATMUL: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
- %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
- %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<?x?xf32> to tensor<64x64xf32>
- %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
- %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32>
- %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Check there are no additional pad tensor operations.
- // MATMUL-NOT: tensor.pad
-
- // Check the matmul directly uses the result of the fill operation.
- // MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
- // MATMUL: %[[T5:.*]] = tensor.extract_slice %[[T4]]
- // MATMUL-SAME: [0, 0]
- // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
- %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
-
- // MATMUL: return %[[T5]]
- func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: different_padding_values
-func.func @different_padding_values(%arg0: tensor<64x64xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 42.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
- %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<?x?xf32> to tensor<64x64xf32>
- %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
- %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Different padding values prevent composing the paddings (42.0 vs. 0.0).
- // MATMUL: = linalg.fill
- // MATMUL: = tensor.pad
- // MATMUL: = linalg.matmul
- %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: different_padding_dynamic_sizes
-func.func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
- %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<?x?xf32> to tensor<64x64xf32>
- %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
- %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
- // MATMUL: = linalg.fill
- // MATMUL: = tensor.pad
- // MATMUL: = linalg.matmul
- %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: different_padding_dynamic_rank
-func.func @different_padding_dynamic_rank(%arg0: tensor<64x64x1xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0, 0] [%size, %size, 1] [1, 1, 1] : tensor<64x64x1xf32> to tensor<?x?xf32>
- %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<?x?xf32> to tensor<64x64xf32>
- %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32>
- %3 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
-
- // Different dynamic ranks prevent composing the paddings ([%size, %size, 1] vs [%size, %size]).
- // MATMUL: = linalg.fill
- // MATMUL: = tensor.pad
- // MATMUL: = linalg.matmul
- %4 = linalg.matmul ins(%3, %3 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %4 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// MATMUL: different_padding_static_sizes
-func.func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
- %iv0 : index) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
- %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<?x?xf32> to tensor<62x62xf32>
- %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<62x62xf32>) -> tensor<62x62xf32>
- %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
-
- // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
- // MATMUL: = linalg.fill
- // MATMUL: = tensor.pad
- // MATMUL: = linalg.matmul
- %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %5 : tensor<?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// FILL: scalar_operand
-// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
-// FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func.func @scalar_operand(%arg0: f32,
- %arg1: tensor<24x12xf32>,
- %iv0 : index) -> tensor<24x12xf32> {
- %0 = affine.min #map0()[%iv0]
-
- // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
- // FILL: %[[T1:.*]] = tensor.pad %[[T0]] nofold
- %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-
- // Check only the fill output operand is padded.
- // FILL: %[[T6:.*]] = linalg.fill ins(%[[ARG0]]{{.*}}outs(%[[T1]]
- %2 = linalg.fill ins(%arg0 : f32) outs(%1 : tensor<4x?xf32>) -> tensor<4x?xf32>
- %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
- func.return %3 : tensor<24x12xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// MATMUL: static_extract_slice_missing
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
-func.func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<4x5xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> {
- %0 = affine.min #map0()[%iv2]
- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-
- // Check the matmul inputs are padded despite the missing slice for the static output.
- // MATMUL: %[[T0:.*]] = tensor.pad
- // MATMUL: %[[T1:.*]] = tensor.pad
- // MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]]
- // MATMUL-SAME: outs(%[[ARG2]]
- %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
- func.return %3 : tensor<4x5xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// MATMUL: dynamic_extract_slice_missing
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- %0 = affine.min #map0()[%iv2]
-
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
- // Check the matmul is not padded due to the missing slice for the dynamic input.
- // MATMUL: = linalg.matmul ins(%[[ARG0]], %[[T0]]
- // MATMUL-SAME: outs(%[[T1]]
- %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
- func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// INPUTS-ONLY: static_input_padding_only
-// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @static_input_padding_only(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- %0 = affine.min #map0()[%iv2]
- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-
- // INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
- // Check the matmul inputs are padded despite the failure to compute a padding value for the static output.
- // INPUTS-ONLY: %[[T1:.*]] = tensor.pad
- // INPUTS-ONLY: %[[T2:.*]] = tensor.pad
- // INPUTS-ONLY: = linalg.matmul ins(%[[T1]], %[[T2]]
- // INPUTS-ONLY-SAME: outs(%[[T0]]
- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
- func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// INPUTS-ONLY: dynamic_input_padding_only
-// INPUTS-ONLY-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>,
-// INPUTS-ONLY-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
-// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
-func.func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- %0 = affine.min #map0()[%iv2]
-
- // INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
- // INPUTS-ONLY: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // INPUTS-ONLY: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
- // Check the matmul is not padded due to the failure to compute a padding value for the dynamic output.
- // INPUTS-ONLY: = linalg.matmul ins(%[[T0]], %[[T1]]
- // INPUTS-ONLY-SAME: outs(%[[T2]]
- %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32>
- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
- func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (64, s0)>
-
-// FILL: rank_reducing
-// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x64x1x64xf32>
-func.func @rank_reducing(%arg0: tensor<1x64x1x64xf32>,
- %iv0 : index) -> tensor<1x?x?xf32> {
- // FILL: %[[CST:.*]] = arith.constant 1.
- %cst = arith.constant 0.0 : f32
- %size = affine.min #map0()[%iv0]
- %0 = tensor.extract_slice %arg0[0, 0, 0, 0] [1, %size, 1, %size] [1, 1, 1, 1] : tensor<1x64x1x64xf32> to tensor<1x?x?xf32>
-
- // Check the fill is padded despite the rank-reducing slice operation.
- // FILL: %[[T0:.*]] = tensor.pad
- // FILL: tensor.yield %[[CST]]
- // FILL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
- // FILL-SAME: tensor<1x64x64xf32>
- // FILL: = tensor.extract_slice %[[T1]]
- %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32>
- func.return %1 : tensor<1x?x?xf32>
-}
-
-// -----
-
-#map0 = affine_map<()[s0] -> (7, s0)>
-
-// PARTIAL: padding_the_output_dims_only
-func.func @padding_the_output_dims_only(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
- // PARTIAL-DAG: %[[C0:.*]] = arith.constant 0 : index
- // PARTIAL-DAG: %[[TS:.*]] = affine.apply
- %0 = affine.min #map0()[%iv2]
-
- // Check only the output dimensions of the matmul are padded.
- // PARTIAL: %[[T0:.*]] = tensor.pad
- // PARTIAL-SAME: [%[[TS]], %[[C0]]
- // PARTIAL: %[[T1:.*]] = tensor.pad
- // PARTIAL-SAME: [%[[C0]], %[[TS]]
- // PARTIAL: %[[T2:.*]] = tensor.pad
- // PARTIAL-SAME: [%[[TS]], %[[TS]]
- %1 = tensor.extract_slice %arg0[%iv0, %iv2] [%0, %0] [1, 1] : tensor<24x12xf32> to tensor<?x?xf32>
- %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor<?x?xf32>
- %3 = tensor.extract_slice %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
-
- // PARTIAL: = linalg.matmul ins(%[[T0]], %[[T1]]
- // PARTIAL-SAME: outs(%[[T2]]
- %4 = linalg.matmul ins(%1, %2 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%3 : tensor<?x?xf32>) -> tensor<?x?xf32>
- %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
- func.return %5 : tensor<24x25xf32>
-}
-
-// -----
-
-// DEPTHWISE_CONV_2D-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (4, -s0 + 11)>
-// DEPTHWISE_CONV_2D-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2)>
-// DEPTHWISE_CONV_2D-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2 + 1)>
-// DEPTHWISE_CONV_2D-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * -2 + 8)>
-// DEPTHWISE_CONV_2D-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 4)>
-
-#map0 = affine_map<()[s0] -> (4, -s0 + 11)>
-#map1 = affine_map<()[s0] -> (s0 * 2)>
-#map2 = affine_map<()[s0] -> (s0 * 2 + 1)>
-
-// DEPTHWISE_CONV_2D: depthwise_conv_2d_padding
-// DEPTHWISE_CONV_2D-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x23x3x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<3x3x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<1x13x1x16xf32>
-// DEPTHWISE_CONV_2D-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
-func.func @depthwise_conv_2d_padding(%arg0: tensor<1x23x3x16xf32>,
- %arg1: tensor<3x3x16xf32>,
- %arg2: tensor<1x13x1x16xf32>,
- %iv0: index) -> tensor<1x?x1x16xf32> {
- // DEPTHWISE_CONV_2D-DAG: %[[CST:.*]] = arith.constant 0.
- // DEPTHWISE_CONV_2D-DAG: %[[C0:.*]] = arith.constant 0 : index
- // DEPTHWISE_CONV_2D-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[IV0]]]
- %0 = affine.min #map0()[%iv0]
- %1 = affine.apply #map1()[%iv0]
- %2 = affine.apply #map2()[%0]
-
- // DEPTHWISE_CONV_2D: %[[T3:.*]] = tensor.extract_slice %[[ARG0]]
- // DEPTHWISE_CONV_2D: %[[T4:.*]] = tensor.extract_slice %[[ARG2]]
- %3 = tensor.extract_slice %arg0[0, %1, 0, 0] [1, %2, 3, 16] [1, 1, 1, 1] : tensor<1x23x3x16xf32> to tensor<1x?x3x16xf32>
- %4 = tensor.extract_slice %arg2[0, %iv0, 0, 0] [1, %0, 1, 16] [1, 1, 1, 1] : tensor<1x13x1x16xf32> to tensor<1x?x1x16xf32>
-
- // Check the padding on the input.
- // DEPTHWISE_CONV_2D: %[[T5:.*]] = affine.apply #[[MAP3]]()[%[[T0]]]
- // DEPTHWISE_CONV_2D: %[[T6:.*]] = tensor.pad %[[T3]]
- // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]]
- // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T5]], %[[C0]], %[[C0]]]
- // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32
-
- // Check the padding on the output.
- // DEPTHWISE_CONV_2D: %[[T7:.*]] = affine.apply #[[MAP4]]()[%[[T0]]]
- // DEPTHWISE_CONV_2D: %[[T8:.*]] = tensor.pad %[[T4]]
- // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]]
- // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T7]], %[[C0]], %[[C0]]]
- // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32
-
- // DEPTHWISE_CONV_2D: %[[T9:.*]] = linalg.depthwise_conv_2d_nhwc_hwc
- // DEPTHWISE_CONV_2D-SAME: ins(%[[T6]], %[[ARG1]] : tensor<1x9x3x16xf32>, tensor<3x3x16xf32>)
- // DEPTHWISE_CONV_2D-SAME: outs(%[[T8]] : tensor<1x4x1x16xf32>)
- %5 = linalg.depthwise_conv_2d_nhwc_hwc
- {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%3, %arg1 : tensor<1x?x3x16xf32>, tensor<3x3x16xf32>)
- outs(%4 : tensor<1x?x1x16xf32>) -> tensor<1x?x1x16xf32>
-
- // Check the extract_slice to crop the padded output before return.
- // DEPTHWISE_CONV_2D: %[[T10:.*]] = tensor.extract_slice %[[T9]][0, 0, 0, 0]
- // DEPTHWISE_CONV_2D-SAME: [1, %[[T0]], 1, 16]
- // DEPTHWISE_CONV_2D: return %[[T10]] : tensor<1x?x1x16xf32>
- return %5 : tensor<1x?x1x16xf32>
-}
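
A targeted padding test like the ones removed above would now be expressed through the transform dialect interpreter. A minimal sketch, assuming the transform op names and attribute spellings available around this revision (not part of this commit):

  // RUN: mlir-opt %s -test-transform-dialect-interpreter | FileCheck %s
  transform.sequence {
  ^bb0(%arg0: !pdl.operation):
    // Match the payload matmul by name, then pad all three iterator
    // dimensions with 0.0 : f32 and pack the two input operands, mirroring
    // pad padding-values=0.:f32,... padding-dimensions=0,1,2
    // pack-paddings=1,1,0 above.
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0
    %1 = transform.structured.pad %0 {padding_values = [0.0 : f32, 0.0 : f32, 0.0 : f32],
                                      padding_dimensions = [0, 1, 2],
                                      pack_paddings = [1, 1, 0]}
  }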
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
deleted file mode 100644
index 463d205ec2a14..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=0,0,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.elemwise_unary fuse tile-sizes=32,32,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=UNARY %s
-
-// MATMUL-LABEL: @tile_sizes_zero(
-func.func @tile_sizes_zero(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst = arith.constant 0.0 : f32
- %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
- %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
- %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
-
- // MATMUL-NOT: scf.for
- // MATMUL: linalg.fill
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
-
- // MATMUL-NOT: scf.for
- // MATMUL: linalg.matmul
- %result = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %result : tensor<?x?xf32>
-}
-
-// -----
-
-// UNARY-LABEL: @shape_only(
-func.func @shape_only(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
- %cst = arith.constant 0.0 : f32
-
- // UNARY: linalg.fill
- %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-
- // UNARY: scf.for
- // UNARY: scf.for
- // UNARY-NOT: linalg.fill
- // UNARY: linalg.elemwise_unary
- %1 = linalg.elemwise_unary {fun = #linalg.unary_fn<exp>}
- ins(%arg0 : tensor<?x?xf32>) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
- func.return %1 : tensor<?x?xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
deleted file mode 100644
index ab709c69651aa..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
+++ /dev/null
@@ -1,323 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s
-
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 24, d0)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
-
-// MATMUL: fuse_input
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func.func @fuse_input(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32>
-
- // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
- // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
- // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
- // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
-
- // Tile both input operand dimensions.
- // MATMUL: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]])
- // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL-SAME: %[[IV1]], %[[IV2]]
- // MATMUL-SAME: %[[UB1]], %[[UB2]]
- // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
- // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]]
- %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
- func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
-
-// MATMUL: fuse_output
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func.func @fuse_output(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
- // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
- // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-
- // Update the iteration argument of the outermost tile loop.
- // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
- // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
- // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
- // MATMUL: %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]])
-
- // Tile both output operand dimensions.
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
- // MATMUL-SAME: %[[IV1]], %[[IV0]]
- // MATMUL-SAME: %[[TS1]], %[[TS0]]
- // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
- // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
-
- // Check there is an extract/insert slice pair for the output operand.
- // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]]
- // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]]
- // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG5]]
- // MATMUL-SAME: 0, 0
- // MATMUL-SAME: %[[D0]], %[[D1]]
- // MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]]
- // MATMUL: %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]]
- // MATMUL-SAME: 0, 0
- // MATMUL-SAME: %[[D0]], %[[D1]]
- %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
- func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 25, d0)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
-
-// MATMUL: fuse_reduction
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
-// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
-func.func @fuse_reduction(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
- ^bb0(%arg4: f32, %arg5: f32):
- %2 = arith.addf %arg4, %arg5 : f32
- linalg.yield %2 : f32
- } -> tensor<12x25xf32>
-
- // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
- // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
- // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
- // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]])
- // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]])
- // MATMUL: %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]])
-
- // Tile only the parallel dimensions but not the reduction dimension.
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
- // MATMUL-SAME: %[[IV2]], 0, %[[IV0]]
- // MATMUL-SAME: %[[UB2]], 7, %[[UB0]]
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // MATMUL-SAME: %[[IV2]], %[[IV0]]
- // MATMUL-SAME: %[[UB2]], %[[UB0]]
- // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
- // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
- %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
- func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d1, d0)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-
-// MATMUL: fuse_transposed
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
-func.func @fuse_transposed(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>,
- %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
- ^bb0(%arg4: f32, %arg5: f32):
- %2 = arith.addf %arg4, %arg5 : f32
- linalg.yield %2 : f32
- } -> tensor<24x12xf32>
-
- // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
-
- // Swap the input operand slice offsets due to the transposed indexing map.
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]]
- // MATMUL-SAME: %[[IV2]], %[[IV1]]
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL-SAME: %[[IV1]], %[[IV2]]
- // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
- // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]]
- %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
- func.return %1 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL: fuse_input_and_output
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
- %arg1: tensor<12x25xf32>,
- %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32>
- %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-
- // Fuse both producers to the appropriate tile loops.
- // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
- // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]]
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
- // MATMUL-SAME: %[[IV1]], %[[IV0]]
- // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
- // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]]
- // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL-SAME: %[[IV1]], %[[IV2]]
- // MATMUL: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
- // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
- // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
- %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
- func.return %2 : tensor<24x25xf32>
-}
-
-// -----
-
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-#map0 = affine_map<(d0, d1) -> (d1, d0)>
-
-// MATMUL: fuse_indexed
-// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
-func.func @fuse_indexed(%arg0: tensor<24x12xi32>,
- %arg1: tensor<12x25xi32>,
- %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) {
- ^bb0(%arg3: i32):
- %6 = linalg.index 0 : index
- %7 = linalg.index 1 : index
- %8 = arith.addi %6, %7 : index
- %9 = arith.index_cast %8 : index to i32
- linalg.yield %9 : i32
- } -> tensor<12x25xi32>
-
- // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
- // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] =
-
- // Shift the indexes by the slice offsets and swap the offsets due to the transposed indexing map.
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // MATMUL-SAME: %[[IV2]], %[[IV0]]
- // MATMUL: linalg.generic {{.*}} outs(%[[T1]]
- // MATMUL: %[[IDX0:.*]] = linalg.index 0
- // MATMUL: %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]])
- // MATMUL: %[[IDX1:.*]] = linalg.index 1
- // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
- // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
- %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
- func.return %1 : tensor<24x25xi32>
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
-#map1 = affine_map<(d0, d1) -> (d0)>
-
-// GENERIC: fuse_outermost_reduction
-// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-// GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
-func.func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
- %arg1: tensor<10xf32>) -> tensor<10xf32> {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
-
- // Cannot fuse the output fill since the reduction loop is the outermost loop.
- // GENERIC: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG1]]
- %1 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<10xf32>) -> tensor<10xf32>
-
- // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]]
- // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]]
-
- // Check the input fill has been fused.
- // GENERIC: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
- // GENERIC-SAME: %[[IV1]], %[[IV0]]
- // GENERIC: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
- // GENERIC: %[[T3:.*]] = tensor.extract_slice %[[ARG3]]
- // GENERIC-SAME: %[[IV1]]
- // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
- %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
- ^bb0(%arg2: f32, %arg3: f32):
- %3 = arith.addf %arg2, %arg3 : f32
- linalg.yield %3 : f32
- } -> tensor<10xf32>
- func.return %2 : tensor<10xf32>
-}
-
-// -----
-
-// GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (-d0 - d1 + 17, 8)>
-// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (-d1 - d2 + 17, d0)>
-#map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-
-// GENERIC: fuse_non_rectangular
-// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-func.func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
- %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
-
- // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index
- // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index
- // GENERIC-DAG: %[[C5:.*]] = arith.constant 5 : index
- // GENERIC-DAG: %[[C8:.*]] = arith.constant 8 : index
- // GENERIC-DAG: %[[C10:.*]] = arith.constant 10 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
-
- // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]]
- // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]]
-
- // Compute the producer on a hyperrectangular bounding box. Along the second
- // dimension, the offset is the sum of the induction variables and the size
- // is the minimum of 8 (the tile size) and 17 minus that sum, where 17 is
- // the sum of the maximal indices (9 + 7) plus one.
- // GENERIC-DAG: %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]]
- // GENERIC-DAG: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]]
- // GENERIC-DAG: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]]
- // GENERIC: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
- // GENERIC-SAME: %[[IV1]], %[[SUM]]
- // GENERIC-SAME: , %[[UB1]]
- // GENERIC: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
- %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
- ^bb0(%arg2: f32, %arg3: f32):
- %2 = arith.addf %arg2, %arg3 : f32
- linalg.yield %2 : f32
- } -> tensor<10x8xf32>
- func.return %1 : tensor<10x8xf32>
-}
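
The tile-and-fuse cases above map onto a single transform op that carries both the tile sizes and the interchange. A rough equivalent, again assuming the transform-dialect syntax of this period rather than anything this commit adds:

  // RUN: mlir-opt %s -test-transform-dialect-interpreter -cse | FileCheck %s
  transform.sequence {
  ^bb0(%arg0: !pdl.operation):
    // Tile the matmul by 5,4,7 with loop order 1,0,2 and greedily fuse its
    // producers into the resulting tile loops, as
    // fuse tile-sizes=5,4,7 tile-interchange=1,0,2 did above.
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0
    %1, %loops:3 = transform.structured.fuse %0 {tile_sizes = [5, 4, 7],
                                                 tile_interchange = [1, 0, 2]}
  }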
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
deleted file mode 100644
index 67b2c606f3648..0000000000000
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir
+++ /dev/null
@@ -1,84 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.conv_2d fuse tile-sizes=4,4,0,0 tile-interchange=0,1,2,3 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=CONV %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=4,4,0 tile-interchange=0,1,2 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s
-
-// CONV: fuse_conv_chain
-// CONV-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<2x2xf32>
-// CONV-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<11x11xf32>
-// CONV-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<10x10xf32>
-// CONV-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<9x9xf32>
-// CONV-SAME: %[[ARG4:[0-9a-zA-Z]*]]: tensor<8x8xf32>
-func.func @fuse_conv_chain(%arg0: tensor<2x2xf32>,
- %arg1: tensor<11x11xf32>,
- %arg2: tensor<10x10xf32>,
- %arg3: tensor<9x9xf32>,
- %arg4: tensor<8x8xf32>) -> tensor<8x8xf32> {
- %cst = arith.constant 1.0 : f32
-
- // Do not tile the filter fill since the filter dimensions are not tiled.
- // CONV: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]]
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<2x2xf32>) -> tensor<2x2xf32>
-
- // Fuse all other operations.
- // CONV: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[ARG4]]
- // CONV: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG6:.*]] = %[[ARG5]]
-
- // CONV: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
- // CONV-SAME: %[[IV0]], %[[IV1]]
- // CONV: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
- // CONV-SAME: %[[IV0]], %[[IV1]]
- // CONV: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]]
- // CONV: %[[T4:.*]] = linalg.conv_2d ins(%[[T1]], %[[T0]] : {{.*}} outs(%[[T3]]
- %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<10x10xf32>) -> tensor<10x10xf32>
- %2 = linalg.conv_2d ins(%arg1, %0 : tensor<11x11xf32>, tensor<2x2xf32>) outs(%1 : tensor<10x10xf32>) -> tensor<10x10xf32>
-
- // CONV: %[[T5:.*]] = tensor.extract_slice %[[ARG3]]
- // CONV-SAME: %[[IV0]], %[[IV1]]
- // CONV: %[[T6:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T5]]
- // CONV: %[[T7:.*]] = linalg.conv_2d ins(%[[T4]], %[[T0]] : {{.*}} outs(%[[T6]]
- %3 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<9x9xf32>) -> tensor<9x9xf32>
- %4 = linalg.conv_2d ins(%2, %0 : tensor<10x10xf32>, tensor<2x2xf32>) outs(%3 : tensor<9x9xf32>) -> tensor<9x9xf32>
-
- // Use the value passed in via the iteration argument.
- // CONV: %[[T8:.*]] = tensor.extract_slice %[[ARG6]]
- // CONV-SAME: %[[IV0]], %[[IV1]]
- // CONV: %[[T9:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T8]]
- // CONV: %[[T5:.*]] = linalg.conv_2d ins(%[[T7]], %[[T0]] {{.*}} outs(%[[T9]]
- %5 = linalg.fill ins(%cst : f32) outs(%arg4 : tensor<8x8xf32>) -> tensor<8x8xf32>
- %6 = linalg.conv_2d ins(%4, %0 : tensor<9x9xf32>, tensor<2x2xf32>) outs(%5 : tensor<8x8xf32>) -> tensor<8x8xf32>
- return %6 : tensor<8x8xf32>
-}
-
-// -----
-
-// MATMUL: fuse_matmul_chain
-// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<8x8xf32>
-func.func @fuse_matmul_chain(%arg0: tensor<8x8xf32>) -> tensor<8x8xf32> {
- %c0 = arith.constant 0 : index
- %c12 = arith.constant 12 : index
- %c25 = arith.constant 25 : index
- %c24 = arith.constant 24 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
-
- // Do not tile the rhs fill of the producer matmul since none of its loop dimensions are tiled.
- // MATMUL: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]]
- %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<8x8xf32>) -> tensor<8x8xf32>
-
- // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG1:.*]] = %[[ARG0]]
- // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[ARG1]]
-
- // Only the outermost loop of the producer matmul is tiled.
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]]
- // MATMUL-SAME: %[[IV0]], 0
- // MATMUL: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]]
- // MATMUL: %[[T3:.*]] = linalg.matmul ins(%[[T2]], %[[T0]] {{.*}}
- %1 = linalg.matmul ins(%0, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32>
-
- // Use the value passed in via the iteration argument.
- // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG2]]
- // MATMUL-SAME: %[[IV0]], %[[IV1]]
- // MATMUL: %[[T5:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T4]]
- // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T5]]
- %2 = linalg.matmul ins(%1, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32>
- return %2 : tensor<8x8xf32>
-}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
deleted file mode 100644
index 7e669c10a274e..0000000000000
--- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
-// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
-// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=memref.copy register-tile-sizes=4,32 vectorize" | \
-
-// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-vector-to-scf,lower-affine,convert-linalg-to-loops)" | \
-// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-scf-to-cf),convert-vector-to-llvm,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts" | \
-// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
-// Activate to dump assembly
-// R_UN: -dump-object-file -object-filename=/tmp/a.o \
-// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
-// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
- // Use tee to both print to stderr and pipe into FileCheck.
-// RUN: tee -a /dev/stderr | FileCheck %s
-
-
-!elem_type_a = f32
-!elem_type_b = f32
-!elem_type_c = f32
-!row_major_A = memref<${M}x${K}x!elem_type_a>
-!row_major_B = memref<${K}x${N}x!elem_type_b>
-!row_major_C = memref<${M}x${N}x!elem_type_c>
-
-func.func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C)
-// TODO: activate manually for now.
-// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
-{
- linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
- outs(%c: !row_major_C)
- return
-}
-
-func.func @print_perf(%iters: index, %total_time: f64) {
- %c2 = arith.constant 2 : index
- %cM = arith.constant ${M} : index
- %cN = arith.constant ${N} : index
- %cK = arith.constant ${K} : index
-
- %mn = arith.muli %cM, %cN : index
- %mnk = arith.muli %mn, %cK : index
-
- // 2*M*N*K.
- %flops_per_iter = arith.muli %c2, %mnk : index
- %flops = arith.muli %iters, %flops_per_iter : index
- %flops_i64 = arith.index_cast %flops : index to i64
- %flops_f = arith.sitofp %flops_i64 : i64 to f64
- %flops_per_s = arith.divf %flops_f, %total_time : f64
- vector.print %flops_per_s : f64
-
- return
-}
-
-func.func @main() {
- %v0 = arith.constant 0.0 : !elem_type_a
- %v1 = arith.constant 1.0 : !elem_type_a
-
- %A = memref.alloc() : !row_major_A
- %B = memref.alloc() : !row_major_B
- %C = memref.alloc() : !row_major_C
-
- linalg.fill ins(%v1 : !elem_type_a) outs(%A : !row_major_A)
- linalg.fill ins(%v1 : !elem_type_b) outs(%B : !row_major_B)
- linalg.fill ins(%v0 : !elem_type_c) outs(%C : !row_major_C)
-
- %c0 = arith.constant 0: index
- %c1 = arith.constant 1: index
- %iters = arith.constant ${ITERS}: index
-
- /// Run and dump performance for matmul.
- /// Preheating run:
- scf.for %arg0 = %c0 to %iters step %c1 {
- %z = arith.constant 0.0 : !elem_type_c
- linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C)
- func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
- }
- %t_start_matmul = call @rtclock() : () -> f64
- scf.for %arg0 = %c0 to %iters step %c1 {
- // linalg.matmul writes %C in place, so it needs to be reset to zero on
- // every iteration. This accounts for roughly a 10-15% perf hit on small
- // sizes. Once linalg on tensors is ready, fusing the fill at the register
- // level will be easy.
- %z = arith.constant 0.0 : !elem_type_c
- linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C)
- func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
- }
- %t_end_matmul = call @rtclock() : () -> f64
- %tmatmul = arith.subf %t_end_matmul, %t_start_matmul: f64
- call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
-
- // CHECK: {{^0$}}
- %C_ref = memref.alloc() : !row_major_C
- linalg.fill ins(%v0 : !elem_type_c) outs(%C_ref : !row_major_C)
- linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
- outs(%C_ref: !row_major_C)
- %act = memref.cast %C : !row_major_C to memref<*xf32>
- %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
- %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
- vector.print %errors : i64
- memref.dealloc %C_ref : !row_major_C
-
- memref.dealloc %A : !row_major_A
- memref.dealloc %B : !row_major_B
- memref.dealloc %C : !row_major_C
-
- return
-}
-
-func.func private @rtclock() -> f64
-func.func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }
-
-// TODO: init with random, run and check output.
-// func private @fill_random_f32(memref<*xf32>)
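
For reference, @print_perf reports flops/s as ITERS * 2*M*N*K divided by the measured time. With the values exported in the RUN lines (M=24, K=64, N=192, ITERS=10), the timed loop performs 10 * 2 * 24 * 192 * 64 = 5,898,240 floating-point operations.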
diff --git a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
index b85b88ad8a71f..e11c49fe6d696 100644
--- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt
@@ -1,6 +1,5 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRLinalgTestPasses
- TestLinalgCodegenStrategy.cpp
TestLinalgElementwiseFusion.cpp
TestLinalgFusionTransforms.cpp
TestLinalgHoisting.cpp
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
deleted file mode 100644
index efe8f38bc23bc..0000000000000
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-//===- TestLinalgCodegenStrategy.cpp - Test Linalg codegen strategy -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements logic for testing the Linalg codegen strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#include <utility>
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Pass/Pass.h"
-
-#include "llvm/ADT/SetVector.h"
-
-using namespace mlir;
-using namespace mlir::linalg;
-
-namespace {
-struct TestLinalgCodegenStrategy
- : public PassWrapper<TestLinalgCodegenStrategy,
- OperationPass<func::FuncOp>> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestLinalgCodegenStrategy)
-
- StringRef getArgument() const final { return "test-linalg-codegen-strategy"; }
- StringRef getDescription() const final {
- return "Test Linalg Codegen Strategy.";
- }
- TestLinalgCodegenStrategy() = default;
- TestLinalgCodegenStrategy(const TestLinalgCodegenStrategy &pass)
- : PassWrapper(pass) {}
-
- void getDependentDialects(DialectRegistry ®istry) const override {
- // clang-format off
- registry.insert<AffineDialect,
- gpu::GPUDialect,
- linalg::LinalgDialect,
- memref::MemRefDialect,
- scf::SCFDialect,
- vector::VectorDialect>();
- // clang-format on
- }
-
- template <typename LinalgNamedOp>
- void applyStrategyToNamedLinalgOp();
-
- void runOnOperation() override;
-
- void runStrategy(const LinalgTilingAndFusionOptions &tilingAndFusionOptions,
- const LinalgTilingOptions &tilingOptions,
- const LinalgTilingOptions ®isterTilingOptions,
- LinalgPaddingOptions paddingOptions,
- vector::VectorContractLowering vectorContractLowering,
- vector::VectorTransferSplit vectorTransferSplit);
-
- Option<bool> fuse{
- *this, "fuse",
- llvm::cl::desc("Fuse the producers after tiling the root op."),
- llvm::cl::init(false)};
- ListOption<int64_t> tileSizes{*this, "tile-sizes",
- llvm::cl::desc("Specifies the tile sizes.")};
- ListOption<int64_t> tileInterchange{
- *this, "tile-interchange",
- llvm::cl::desc("Specifies the tile interchange.")};
-
- Option<bool> promote{
- *this, "promote",
- llvm::cl::desc("Promote the tile into a small aligned memory buffer."),
- llvm::cl::init(false)};
- Option<bool> promoteFullTile{
- *this, "promote-full-tile-pad",
- llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."),
- llvm::cl::init(false)};
- ListOption<int64_t> registerTileSizes{
- *this, "register-tile-sizes",
- llvm::cl::desc(
- "Specifies the size of the register tile that will be used "
- " to vectorize")};
- Option<bool> registerPromote{
- *this, "register-promote",
- llvm::cl::desc(
- "Promote the register tile into a small aligned memory buffer."),
- llvm::cl::init(false)};
- Option<bool> registerPromoteFullTile{
- *this, "register-promote-full-tile-pad",
- llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."),
- llvm::cl::init(false)};
- Option<bool> pad{*this, "pad", llvm::cl::desc("Pad the operands."),
- llvm::cl::init(false)};
- ListOption<std::string> paddingValues{
- *this, "padding-values",
- llvm::cl::desc("Operand padding values parsed by the attribute parser.")};
- ListOption<int64_t> paddingDimensions{
- *this, "padding-dimensions",
- llvm::cl::desc("Operation iterator dimensions to pad.")};
- ListOption<int64_t> packPaddings{*this, "pack-paddings",
- llvm::cl::desc("Operand packing flags.")};
- ListOption<int64_t> hoistPaddings{*this, "hoist-paddings",
- llvm::cl::desc("Operand hoisting depths.")};
- ListOption<SmallVector<int64_t>> transposePaddings{
- *this, "transpose-paddings",
- llvm::cl::desc(
- "Transpose paddings. Specify a operand dimension interchange "
- "using the following format:\n"
- "-transpose-paddings=[1,0,2],[0,1],[0,1]\n"
- "It defines the interchange [1, 0, 2] for operand one and "
- "the interchange [0, 1] (no transpose) for the remaining operands."
- "All interchange vectors have to be permuations matching the "
- "operand rank.")};
- Option<bool> generalize{*this, "generalize",
- llvm::cl::desc("Generalize named operations."),
- llvm::cl::init(false)};
- ListOption<int64_t> iteratorInterchange{
- *this, "iterator-interchange",
- llvm::cl::desc("Specifies the iterator interchange.")};
- Option<bool> decompose{
- *this, "decompose",
- llvm::cl::desc("Decompose convolutions to lower dimensional ones."),
- llvm::cl::init(false)};
- Option<bool> vectorize{
- *this, "vectorize",
- llvm::cl::desc("Rewrite the linalg op as a vector operation."),
- llvm::cl::init(false)};
- Option<bool> vectorizePadding{
- *this, "vectorize-padding",
- llvm::cl::desc("Rewrite pad tensor ops as vector operations."),
- llvm::cl::init(false)};
- Option<std::string> splitVectorTransfersTo{
- *this, "split-transfers",
- llvm::cl::desc(
- "Split vector transfers between slow (masked) and fast "
- "(unmasked) variants. Possible options are:\n"
- "\tnone: keep unsplit vector.transfer and pay the full price\n"
- "\tmemref.copy: use linalg.fill + memref.copy for the slow path\n"
- "\tvector-transfers: use extra small unmasked vector.transfer for"
- " the slow path\n"),
- llvm::cl::init("none")};
- Option<std::string> vectorizeContractionTo{
- *this, "vectorize-contraction-to",
- llvm::cl::desc("the type of vector op to use for linalg contractions"),
- llvm::cl::init("outerproduct")};
- Option<bool> unrollVectorTransfers{
- *this, "unroll-vector-transfers",
- llvm::cl::desc("Enable full unrolling of vector.transfer operations"),
- llvm::cl::init(false)};
- Option<bool> runEnablePass{
- *this, "run-enable-pass",
- llvm::cl::desc("Run the enable pass between transformations"),
- llvm::cl::init(true)};
- Option<std::string> anchorOpName{
- *this, "anchor-op",
- llvm::cl::desc(
- "Which single linalg op is the anchor for the codegen strategy to "
- "latch on:\n"
- "\tlinalg.matmul: anchor on linalg.matmul\n"
- "\tlinalg.matmul_column_major: anchor on linalg.matmul_column_major\n"
- "\tmemref.copy: anchor on memref.copy\n"
- "\tlinalg.fill: anchor on linalg.fill\n"),
- llvm::cl::init("")};
- Option<std::string> anchorFuncOpName{
- *this, "anchor-func",
- llvm::cl::desc(
- "Which single func op is the anchor for the codegen strategy to "
- "latch on."),
- llvm::cl::init("")};
-};
-
-void TestLinalgCodegenStrategy::runStrategy(
- const LinalgTilingAndFusionOptions &tilingAndFusionOptions,
- const LinalgTilingOptions &tilingOptions,
- const LinalgTilingOptions ®isterTilingOptions,
- LinalgPaddingOptions paddingOptions,
- vector::VectorContractLowering vectorContractLowering,
- vector::VectorTransferSplit vectorTransferSplit) {
- std::string anchorOpNameOrWildcard = fuse ? "" : anchorOpName.getValue();
- CodegenStrategy strategy;
- strategy
- .tileAndFuseIf(fuse && !tileSizes.empty(), anchorOpName,
- tilingAndFusionOptions)
- .tileIf(!fuse && !tileSizes.empty(), anchorOpName, tilingOptions)
- .promoteIf(!fuse && promote, anchorOpName,
- LinalgPromotionOptions()
- .setAlignment(16)
- .setUseFullTileBuffersByDefault(promoteFullTile))
- .tileIf(!fuse && !registerTileSizes.empty(), anchorOpName,
- registerTilingOptions)
- .promoteIf(!fuse && registerPromote, anchorOpName,
- LinalgPromotionOptions()
- .setAlignment(16)
- .setUseFullTileBuffersByDefault(registerPromoteFullTile))
- .padIf(pad, anchorOpNameOrWildcard, std::move(paddingOptions))
- .decomposeIf(decompose)
- .generalizeIf(generalize, anchorOpNameOrWildcard)
- .interchangeIf(!iteratorInterchange.empty(), iteratorInterchange)
- .vectorizeIf(vectorize, anchorOpNameOrWildcard, nullptr, vectorizePadding)
- .vectorLowering(
- LinalgVectorLoweringOptions()
- .setVectorTransformsOptions(
- vector::VectorTransformsOptions()
- .setVectorTransformsOptions(vectorContractLowering)
- .setVectorTransferSplit(vectorTransferSplit))
- .setVectorTransferToSCFOptions(
- VectorTransferToSCFOptions().enableFullUnroll(
- unrollVectorTransfers))
- .enableTransferPartialRewrite()
- .enableContractionLowering()
- .enableTransferToSCFConversion());
- // Create a nested OpPassManager and run it.
- func::FuncOp funcOp = getOperation();
- OpPassManager dynamicPM("func.func");
- strategy.configurePassPipeline(dynamicPM, funcOp.getContext(), runEnablePass);
- if (failed(runPipeline(dynamicPM, funcOp)))
- return signalPassFailure();
-}
-} // namespace
-
-/// Apply transformations specified as patterns.
-void TestLinalgCodegenStrategy::runOnOperation() {
- if (!anchorFuncOpName.empty() && anchorFuncOpName != getOperation().getName())
- return;
-
- LinalgTilingAndFusionOptions tilingAndFusionOptions;
- tilingAndFusionOptions.tileSizes = {tileSizes.begin(), tileSizes.end()};
- tilingAndFusionOptions.tileInterchange = {tileInterchange.begin(),
- tileInterchange.end()};
-
- LinalgTilingOptions tilingOptions;
- if (!tileSizes.empty())
- tilingOptions = tilingOptions.setTileSizes(tileSizes);
- if (!tileInterchange.empty())
- tilingOptions = tilingOptions.setInterchange(
- SmallVector<unsigned>(tileInterchange.begin(), tileInterchange.end()));
-
- LinalgTilingOptions registerTilingOptions;
- if (!registerTileSizes.empty())
- registerTilingOptions =
- registerTilingOptions.setTileSizes(registerTileSizes);
-
- // Parse the padding values.
- SmallVector<Attribute> paddingValueAttributes;
- for (const std::string &paddingValue : paddingValues) {
- paddingValueAttributes.push_back(
- parseAttribute(paddingValue, &getContext()));
- }
-
- // Parse the transpose vectors.
- LinalgPaddingOptions paddingOptions;
- paddingOptions.setPaddingValues(paddingValueAttributes);
- paddingOptions.setPaddingDimensions(
- SmallVector<int64_t>{paddingDimensions.begin(), paddingDimensions.end()});
- paddingOptions.setPackPaddings(
- SmallVector<bool>{packPaddings.begin(), packPaddings.end()});
- paddingOptions.setHoistPaddings(
- SmallVector<int64_t>{hoistPaddings.begin(), hoistPaddings.end()});
- paddingOptions.setTransposePaddings(transposePaddings);
-
- vector::VectorContractLowering vectorContractLowering =
- llvm::StringSwitch<vector::VectorContractLowering>(
- vectorizeContractionTo.getValue())
- .Case("matrixintrinsics", vector::VectorContractLowering::Matmul)
- .Case("dot", vector::VectorContractLowering::Dot)
- .Case("outerproduct", vector::VectorContractLowering::OuterProduct)
- .Default(vector::VectorContractLowering::OuterProduct);
- vector::VectorTransferSplit vectorTransferSplit =
- llvm::StringSwitch<vector::VectorTransferSplit>(
- splitVectorTransfersTo.getValue())
- .Case("none", vector::VectorTransferSplit::None)
- .Case("memref-copy", vector::VectorTransferSplit::LinalgCopy)
- .Case("vector-transfers", vector::VectorTransferSplit::VectorTransfer)
- .Default(vector::VectorTransferSplit::None);
-
- runStrategy(tilingAndFusionOptions, tilingOptions, registerTilingOptions,
- paddingOptions, vectorContractLowering, vectorTransferSplit);
-}
-
-namespace mlir {
-namespace test {
-void registerTestLinalgCodegenStrategy() {
- PassRegistration<TestLinalgCodegenStrategy>();
-}
-} // namespace test
-} // namespace mlir
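
Where this pass anchored on a function and op name via anchor-func/anchor-op, transform-dialect tests select their targets inside the IR itself with transform.structured.match, so a typical RUN line reduces to something like the following (a sketch; the interpreter test-pass flag spelling is assumed, not defined by this commit):

  // RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file | FileCheck %s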
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index c9b1b492eefaf..778c569c5ce16 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -86,7 +86,6 @@ void registerTestGenericIRVisitorsPass();
void registerTestGenericIRVisitorsInterruptPass();
void registerTestInterfaces();
void registerTestLastModifiedPass();
-void registerTestLinalgCodegenStrategy();
void registerTestLinalgElementwiseFusion();
void registerTestLinalgFusionTransforms();
void registerTestLinalgTensorFusionTransforms();
@@ -185,7 +184,6 @@ void registerTestPasses() {
mlir::test::registerTestGenericIRVisitorsPass();
mlir::test::registerTestInterfaces();
mlir::test::registerTestLastModifiedPass();
- mlir::test::registerTestLinalgCodegenStrategy();
mlir::test::registerTestLinalgElementwiseFusion();
mlir::test::registerTestLinalgFusionTransforms();
mlir::test::registerTestLinalgTensorFusionTransforms();