[Mlir-commits] [mlir] 9fbcad3 - [mlir][linalg] Improve the padding packing loop computation.

Mon Nov 8 02:21:44 PST 2021

Author: Tobias Gysi
Date: 2021-11-08T10:20:33Z
New Revision: 9fbcad3298569c8c3e0fab3240ecbd2748becb2e

URL: https://github.com/llvm/llvm-project/commit/9fbcad3298569c8c3e0fab3240ecbd2748becb2e
DIFF: https://github.com/llvm/llvm-project/commit/9fbcad3298569c8c3e0fab3240ecbd2748becb2e.diff

LOG: [mlir][linalg] Improve the padding packing loop computation.

The revision updates the packing loop search in hoist padding. Instead of considering all loops in the backward slice, we now compute a separate backward slice containing the index computations only. This modification ensures we do not add packing loops that are not used to index the packed buffer due to spurious dependencies. One instance where such spurious dependencies can appear is the extract slice operation introduced between the tile loops of a double tiling.

Depends On D112412

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D112713

Added: 
    

Modified: 
    mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
    mlir/test/Dialect/Linalg/hoist-padding.mlir
    mlir/test/Dialect/Linalg/pad-and-hoist.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
index 5346e236b167..871c369eaa63 100644

--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -48,6 +48,10 @@ using namespace mlir::linalg;
 ///      contains an unknown op with a region.
 ///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
 ///      empty.
+///   5. The source tensor of pad op is not defined by an extract slice op.
+///   6. The source tensor of the extract slice op is not defined outside of
+///      the outermost enclosing scf::ForOp.
+///   7. There is no enclosing scf::ForOp that indexes the padded data.
 /// Other cases succeed and will trigger hoisting of the pad op.
 struct HoistingAnalysis {
   HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);
@@ -82,6 +86,26 @@ struct HoistingAnalysis {
       packingLoops;
 
 private:
+  /// Returns the loops in `backwardSlice` used to index the padded data. The
+  /// method starts from `padTensorOp` and `sliceOp`, follows the use-def
+  /// chains of their index operands, and stores any enclosing loop whose
+  /// induction variable is part of the walked index computation.
+  ///
+  /// Example:
+  /// ```
+  /// %source = linalg.fill(%cst, %arg0)
+  /// scf.for %i
+  ///   scf.for %j
+  ///     scf.for %k // not used to index %source!
+  ///       %ubi = affine.min #map(%i)
+  ///       %ubj = affine.min #map(%j)
+  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  ///       %padded_slice = linalg.pad_tensor %slice
+  /// ```
+  /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j]
+  SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
+                                          tensor::ExtractSliceOp sliceOp);
+
   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
 };
@@ -166,28 +190,114 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
   if (analysisFailure || backwardSlice.empty())
     return;
 
-  // Backward slice is a topologically sorted list of ops starting at
-  // `outermostEnclosingForOp`.
-  assert(outermostEnclosingForOp == backwardSlice.front());
+  // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
+  // check its source is defined outside of the outermost loop. This check
+  // ensures the padded data is available for packing before entering the
+  // outermost enclosing loop.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // // %source is available for packing here!
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k
+  //       %slice = tensor.extract_slice %source [%i, %j]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
+  if (!sliceOp) {
+    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
+    return;
+  }
+  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
+    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
+    return;
+  }
 
-  // Filter out the loops whose induction variable is not used to compute the
-  // padded result. As a first approximation, just look for IVs that have no use
-  // in the backwardSlice.
-  // These are the dimensions of reuse that we can exploit to reduce the amount
-  // of copy / memory.
+  // Search the loops found in `backwardSlice` used to index the padded data.
+  SetVector<Operation *> indexingLoops = getIndexingLoops(padTensorOp, sliceOp);
+
+  // Add only the loops part of `indexingLoops` to the packing loops. All other
+  // loops are not used to index the padded data and consequently access the
+  // same data in every loop iteration. Adding them to the packing loops would
+  // increase the cache footprint of the packed data by storing the same data
+  // multiple times.
   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
-    for (Operation *user : forOp.getInductionVar().getUsers()) {
-      if (backwardSlice.contains(user)) {
-        packingLoops.insert(forOp);
-        break;
-      }
-    }
+    if (indexingLoops.contains(forOp))
+      packingLoops.insert(forOp);
+  }
+  assert(indexingLoops.size() == packingLoops.size() &&
+         "expect the all indexing loops are enclosing loops");
+  if (packingLoops.empty()) {
+    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
+    return;
   }
 
   // The analysis is valid and hoisting can occur.
   valid = true;
 }
 
+SetVector<Operation *>
+HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
+                                   tensor::ExtractSliceOp sliceOp) {
+  // Set of all values used for index computation.
+  SetVector<Value> indexEdges;
+
+  // Helper function that adds all index operands of an operation to
+  // `indexEdges`. An operand is an index operand if it is of index type.
+  auto addIndexOperandsToIndexEdges = [&](Operation *op) {
+    for (Value operand : op->getOperands())
+      if (operand.getType().isIndex())
+        indexEdges.insert(operand);
+  };
+
+  // Starting from `padTensorOp` and `sliceOp` walk the use-def edges of index
+  // type in `backwardSlice`. Add the index operands of an operation to
+  // `indexEdges` if one of its results is an index edge found so far and store
+  // all loops part of the index computation to `indexingLoops`.
+  //
+  // Example:
+  // ```
+  // %source = linalg.fill(%cst, %arg0)
+  // scf.for %i
+  //   scf.for %j
+  //     scf.for %k // not used to index %source!
+  //       %ubi = affine.min #map(%i)
+  //       %ubj = affine.min #map(%j)
+  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
+  //       %padded_slice = linalg.pad_tensor %slice
+  // ```
+  // After iterating `backwardSlice` we obtain:
+  // indexEdges = [%i, %j, %ubi, %ubj]
+  // indexingLoops = [scf.for %i, scf.for %j]
+  SetVector<Operation *> indexingLoops;
+  for (Operation *op : llvm::reverse(backwardSlice)) {
+    // Add the index operands of `padTensorOp` and `sliceOp` to start the
+    // exploration of the index computation.
+    if (op == padTensorOp || op == sliceOp) {
+      addIndexOperandsToIndexEdges(op);
+      continue;
+    }
+    // Add the index operands of the loop if its induction variable is
+    // used for index computation. Additionally, insert the loop into
+    // `indexingLoops`
+    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+      if (indexEdges.contains(forOp.getInductionVar())) {
+        addIndexOperandsToIndexEdges(op);
+        indexingLoops.insert(forOp);
+        continue;
+      }
+    }
+    // Add the index operands of all other operations if at least one result is
+    // used for index computation.
+    if (llvm::any_of(op->getResults(),
+                     [&](Value result) { return indexEdges.contains(result); }))
+      addIndexOperandsToIndexEdges(op);
+  }
+  return indexingLoops;
+}
+
 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
   return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
 }

diff  --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
index b6c0eed3f111..d29e11da5dcf 100644
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -141,29 +141,25 @@ func @matmul_tensors(
 
 // -----
 
-
 // CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-// CHECK-DAG: #[[$MIN_REST4:[0-9a-z]+]] = affine_map<(d0, d1) -> (4, d0 - d1)>
-// CHECK-DAG: #[[$MIN_REST2:[0-9a-z]+]] = affine_map<(d0, d1) -> (2, d0 - d1)>
 // CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
 // CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
 #map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
-#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map1 = affine_map<(d0, d1) -> (4, -d0 + d1)>
+#map2 = affine_map<(d0, d1) -> (2, -d0 + d1)>
+#map3 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
 
 // CHECK-LABEL: func @dot
 // VERIFIER-ONLY-LABEL: func @dot
 func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
     -> tensor<f32>
 {
+  %cst = arith.constant 0.000000e+00 : f32
   %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
   %c2 = arith.constant 2 : index
-  %c0 = arith.constant 0 : index
-  %1 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %2 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %3 = tensor.dim %arg1, %c0 : tensor<?xf32>
+  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
 
   //      CHECK: scf.for %[[I:[0-9a-z]+]] =
   //
@@ -194,39 +190,32 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
   //      CHECK:       %[[B:.*]] = tensor.extract_slice %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
   //      CHECK:       linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
 
-  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
-    %5 = affine.min #map0(%arg3)[%2]
-    %6 = tensor.extract_slice %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
-    %7 = affine.min #map0(%arg3)[%3]
-    %8 = tensor.extract_slice %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
-    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
-      %10 = affine.min #map1(%5, %arg5)
-      %11 = tensor.extract_slice %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
-      %12 = affine.min #map1(%7, %arg5)
-      %13 = tensor.extract_slice %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
-      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
-        %15 = affine.min #map2(%10, %arg7)
-        %16 = tensor.extract_slice %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
-        %17 = affine.min #map2(%12, %arg7)
-        %18 = tensor.extract_slice %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
-        %19 = arith.subi %c2, %15 : index
-        %20 = linalg.pad_tensor %16 low[%c0] high[%19]  {
+  %1 = scf.for %arg3 = %c0 to %0 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %2 = affine.min #map0(%arg3)[%0]
+    %3 = scf.for %arg5 = %c0 to %2 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %4 = affine.min #map1(%arg5, %2)
+      %5 = scf.for %arg7 = %c0 to %4 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %6 = affine.min #map2(%arg7, %4)
+        %7 = affine.apply #map3(%arg7, %arg5, %arg3)
+        %8 = tensor.extract_slice %arg0[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %9 = tensor.extract_slice %arg1[%7] [%6] [1] : tensor<?xf32> to tensor<?xf32>
+        %10 = arith.subi %c2, %6 : index
+        %11 = linalg.pad_tensor %8 low[%c0] high[%10]  {
         ^bb0(%arg9: index):  // no predecessors
           linalg.yield %cst : f32
         } : tensor<?xf32> to tensor<2xf32>
-        %21 = arith.subi %c2, %17 : index
-        %22 = linalg.pad_tensor %18 low[%c0] high[%21]  {
+        %12 = linalg.pad_tensor %9 low[%c0] high[%10]  {
         ^bb0(%arg9: index):  // no predecessors
           linalg.yield %cst : f32
         } : tensor<?xf32> to tensor<2xf32>
-        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
-        scf.yield %23 : tensor<f32>
+        %13 = linalg.dot ins(%11, %12 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %13 : tensor<f32>
       }
-      scf.yield %14 : tensor<f32>
+      scf.yield %5 : tensor<f32>
     }
-    scf.yield %9 : tensor<f32>
+    scf.yield %3 : tensor<f32>
   }
-  return %4 : tensor<f32>
+  return %1 : tensor<f32>
 }
 
 // -----

diff  --git a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
index 93e2bf5f189d..d8deb31d09a8 100644
--- a/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
+++ b/mlir/test/Dialect/Linalg/pad-and-hoist.mlir
@@ -1,25 +1,23 @@
 // RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
+// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=3,2,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE
 
 // CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)>
 // CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (5, -d0 + 24)>
-#map1 = affine_map<(d0) -> (8, -d0 + 12)>
-#map2 = affine_map<(d0) -> (7, -d0 + 25)>
-
-//      CHECK:  single_tiling
-//      CHECK-DOUBLE:  single_tiling
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
 
+//      CHECK:  static_sizes
+//      CHECK-DOUBLE:  static_sizes
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
-func @single_tiling(%arg0: tensor<24x12xf32>,
-                    %arg1: tensor<12x25xf32>,
-                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+func @static_sizes(%arg0: tensor<24x12xf32>,
+                   %arg1: tensor<12x25xf32>,
+                   %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   //  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   //  CHECK-DAG: %[[C5:.*]] = arith.constant 5
-  //  CHECK-DAG: %[[C8:.*]] = arith.constant 8
+  //  CHECK-DAG: %[[C7:.*]] = arith.constant 7
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -33,11 +31,11 @@ func @single_tiling(%arg0: tensor<24x12xf32>,
 
     // Packing the first input operand for all values of IV2 (IV2x5x6).
     //      CHECK:  = linalg.init_tensor [2, 5, 6]
-    //      CHECK:  %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] =
-    //        CHECK:   %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]])
+    //      CHECK:  %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    //        CHECK:   %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[PIV0]])
     //        CHECK:   %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])
     //        CHECK:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-    //   CHECK-SAME:                                     %[[IV0]], %[[P0IV2]]
+    //   CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
     //   CHECK-SAME:                                     %[[TS0]], 6
     //        CHECK:   %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
     //        CHECK:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]]
@@ -47,15 +45,15 @@ func @single_tiling(%arg0: tensor<24x12xf32>,
     //      CHECK:  scf.for %[[IV1:[0-9a-zA-Z]*]] =
     %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
 
-      // Packing the second input operand for all values of IV2 (IV2x6x8).
-      //      CHECK:  = linalg.init_tensor [2, 6, 8]
-      //      CHECK:  %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] =
-      //        CHECK:   %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]])
+      // Packing the second input operand for all values of IV2 (IV2x6x7).
+      //      CHECK:  = linalg.init_tensor [2, 6, 7]
+      //      CHECK:  %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      //        CHECK:   %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
       //        CHECK:   %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]])
       //        CHECK:   %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
-      //   CHECK-SAME:                                     %[[P1IV2]], %[[IV1]]
+      //   CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
       //   CHECK-SAME:                                     6, %[[TS1]]
-      //        CHECK:   %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]]
+      //        CHECK:   %[[V1:.*]] = arith.subi %[[C7]], %[[TS1]]
       //        CHECK:   %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]]
       //        CHECK:   %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
       //        CHECK:   scf.yield %[[T5:.*]]
@@ -63,6 +61,7 @@ func @single_tiling(%arg0: tensor<24x12xf32>,
       //      CHECK:  scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
       %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
         %3 = affine.min #map0(%arg3)
+
         // Index the packed operands.
         //    CHECK-DAG:   %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]])
         //    CHECK-DAG:   %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
@@ -91,6 +90,108 @@ func @single_tiling(%arg0: tensor<24x12xf32>,
 
 // -----
 
+// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+// CHECK-DAG: #[[SDIV6:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 6)>
+// CHECK-DAG: #[[DDIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
+#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)>
+
+//      CHECK:  dynamic_sizes
+//      CHECK-DOUBLE:  dynamic_sizes
+// CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// CHECK-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+func @dynamic_sizes(%arg0: tensor<?x?xf32>,
+                    %arg1: tensor<?x?xf32>,
+                    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  //  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  //  CHECK-DAG: %[[C1:.*]] = arith.constant 1
+  //  CHECK-DAG: %[[C5:.*]] = arith.constant 5
+  //  CHECK-DAG: %[[C6:.*]] = arith.constant 6
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c5 = arith.constant 5 : index
+
+  //  CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
+  //  CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]]
+  //  CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]]
+  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+
+  //      CHECK:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
+  %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+
+    // Packing the first input operand for all values of IV2 (IV2x5x6).
+    //      CHECK:  %[[PS0:.*]] = affine.apply #[[SDIV6]]()[%[[D1]]
+    //      CHECK:  = linalg.init_tensor [%[[PS0]], 5, 6]
+    //      CHECK:  %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    //        CHECK:   %[[PIDX0:.*]] = affine.apply #[[DDIV6]](%[[PIV0]])
+    //        CHECK:   %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]]
+    //        CHECK:   %[[TS1:.*]] = affine.min #[[MAP1]](%[[PIV0]])[%[[D1]]
+    //        CHECK:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    //   CHECK-SAME:                                     %[[IV0]], %[[PIV0]]
+    //   CHECK-SAME:                                     %[[TS0]], %[[TS1]]
+    //        CHECK:   %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]]
+    //        CHECK:   %[[V1:.*]] = arith.subi %[[C6]], %[[TS1]]
+    //        CHECK:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]], %[[V1]]
+    //        CHECK:   %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    //        CHECK:   scf.yield %[[T2:.*]]
+
+    //      CHECK:  scf.for %[[IV1:[0-9a-zA-Z]*]] =
+    %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+
+      // Packing the second input operand for all values of IV2 (IV2x6x7).
+      //      CHECK:  = linalg.init_tensor [%[[PS0]], 6, 7]
+      //      CHECK:  %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      //        CHECK:   %[[PIDX1:.*]] = affine.apply #[[DDIV6]](%[[PIV1]])
+      //        CHECK:   %[[TS2:.*]] = affine.min #[[MAP1]](%[[PIV1]])[%[[D1]]
+      //        CHECK:   %[[TS3:.*]] = affine.min #[[MAP2]](%[[IV1]])[%[[D2]]
+      //        CHECK:   %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      //   CHECK-SAME:                                     %[[PIV1]], %[[IV1]]
+      //   CHECK-SAME:                                     %[[TS2]], %[[TS3]]
+      //        CHECK:   %[[V2:.*]] = arith.subi %[[C6]], %[[TS2]]
+      //        CHECK:   %[[V3:.*]] = arith.subi %[[C7]], %[[TS3]]
+      //        CHECK:   %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[V2]], %[[V3]]
+      //        CHECK:   %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      //        CHECK:   scf.yield %[[T5:.*]]
+
+      //      CHECK:  scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
+      %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+        %6 = affine.min #map0(%arg3)[%0]
+        %7 = affine.min #map1(%arg7)[%1]
+
+        // Index the packed operands.
+        //    CHECK-DAG:   %[[IDX:.*]] = affine.apply #[[DDIV6]](%[[IV2]])
+        //    CHECK-DAG:   %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]]
+        //    CHECK-DAG:   %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]]
+        %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %9 = affine.min #map2(%arg5)[%2]
+        %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+
+        // Check matmul uses the packed input operands.
+        //        CHECK:   = linalg.matmul ins(%[[T6]], %[[T7]]
+        %12 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+        scf.yield %13 : tensor<?x?xf32>
+      }
+      scf.yield %5 : tensor<?x?xf32>
+    }
+    scf.yield %4 : tensor<?x?xf32>
+  }
+  return %3 : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-DOUBLE-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)>
+// CHECK-DOUBLE-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)>
 #map0 = affine_map<(d0) -> (15, -d0 + 24)>
 #map1 = affine_map<(d0) -> (16, -d0 + 25)>
 #map2 = affine_map<(d0, d1) -> (5, -d0 + d1)>
@@ -99,7 +200,6 @@ func @single_tiling(%arg0: tensor<24x12xf32>,
 
 //      CHECK:  double_tiling
 //      CHECK-DOUBLE:  double_tiling
-
 // CHECK-DOUBLE-SAME:    %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // CHECK-DOUBLE-SAME:    %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // CHECK-DOUBLE-SAME:    %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
@@ -114,16 +214,17 @@ func @double_tiling(%arg0: tensor<24x12xf32>,
   %c5 = arith.constant 5 : index
   %c6 = arith.constant 6 : index
 
-  // Packing the first input operand.
-  //    CHECK-DOUBLE:  = linalg.init_tensor
-  //    CHECK-DOUBLE:  = linalg.pad_tensor {{.*}} nofold
-
   //    CHECK-DOUBLE:  scf.for %[[IV0:[0-9a-zA-Z]*]] =
   %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
 
-    // Packing the second input operand.
-    //    CHECK-DOUBLE:  = linalg.init_tensor
-    //    CHECK-DOUBLE:  = linalg.pad_tensor {{.*}} nofold
+    // Packing the first input operand.
+    //    CHECK-DOUBLE:  = linalg.init_tensor [3, 5, 12]
+    //    CHECK-DOUBLE:  %[[PT0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] =
+    //      CHECK-DOUBLE:   %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]])
+    //      CHECK-DOUBLE:   %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+    //      CHECK-DOUBLE:   %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
+    //      CHECK-DOUBLE:   %[[T2:.*]] = tensor.insert_slice %[[T1:.*]] into %{{.*}}[%[[PIDX0]], 0, 0]
+    //      CHECK-DOUBLE:   scf.yield %[[T2:.*]]
 
     //    CHECK-DOUBLE:  scf.for %[[IV1:[0-9a-zA-Z]*]] =
     %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
@@ -131,6 +232,15 @@ func @double_tiling(%arg0: tensor<24x12xf32>,
       %3 = affine.min #map1(%arg5)
       %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
 
+      // Packing the second input operand.
+      //    CHECK-DOUBLE:  = linalg.init_tensor [%{{.*}}, 12, 6]
+      //    CHECK-DOUBLE:  %[[PT1:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
+      //      CHECK-DOUBLE:   %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[PIV1]])
+      //      CHECK-DOUBLE:   %[[T3:.*]] = tensor.extract_slice %[[ARG1]]
+      //      CHECK-DOUBLE:   %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold
+      //      CHECK-DOUBLE:   %[[T5:.*]] = tensor.insert_slice %[[T4:.*]] into %{{.*}}[%[[PIDX1]], 0, 0]
+      //      CHECK-DOUBLE:   scf.yield %[[T5:.*]]
+
       //    CHECK-DOUBLE:  scf.for %[[IV2:[0-9a-zA-Z]*]] =
       %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) {
 
@@ -138,6 +248,12 @@ func @double_tiling(%arg0: tensor<24x12xf32>,
         %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) {
           %8 = affine.min #map2(%arg7, %2)
           %9 = affine.apply #map3(%arg7, %arg3)
+
+          // Index the packed operands.
+          //    CHECK-DOUBLE-DAG:   %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV2]])
+          //    CHECK-DOUBLE-DAG:   %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX0]]
+          //    CHECK-DOUBLE-DAG:   %[[IDX1:.*]] = affine.apply #[[DIV6]](%[[IV3]])
+          //    CHECK-DOUBLE-DAG:   %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX1]]
           %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32>
           %11 = affine.min #map4(%arg9, %3)
           %12 = affine.apply #map3(%arg9, %arg5)
@@ -146,9 +262,8 @@ func @double_tiling(%arg0: tensor<24x12xf32>,
           %15 = affine.min #map4(%arg9, %3)
           %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 
-          // Pad the output operand and perform the multiplication.
-          //        CHECK-DOUBLE:   = linalg.pad_tensor
-          //        CHECK-DOUBLE:   = linalg.matmul
+          // Check matmul uses the packed input operands.
+          //    CHECK-DOUBLE:   = linalg.matmul ins(%[[T6]], %[[T7]]
           %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
           %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
           scf.yield %18 : tensor<?x?xf32>