[Mlir-commits] [mlir] [mlir][transform] Plumb a simplified form of AffineMin folding into t… (PR #145170)

Sat Jun 21 07:12:07 PDT 2025

https://github.com/nicolasvasilache created https://github.com/llvm/llvm-project/pull/145170

…ransform.pad-tiling-interface

This revision introduces a simple variant of AffineMin folding in makeComposedFoldedAffineApply and makes use of it in transform.pad-tiling-interface. Since this version explicitly calls ValueBoundsInterface, it may be too expensive and is only activated behind a flag.
It results in better foldings when mixing tiling and padding, including with dynamic shapes.

This should be further composed with #145068 to provide full simplification and address the remaining TODO in the test.

>From 856d708e2cb54fecc519db6a78029243ea409e34 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nico.vasilache at amd.com>
Date: Wed, 18 Jun 2025 19:14:31 +0200
Subject: [PATCH] [mlir][transform] Plumb a simplified form of AffineMin
 folding into transform.pad-tiling-interface

This revision introduces a simple variant of AffineMin folding in makeComposedFoldedAffineApply
and makes use of it in transform.pad-tiling-interface.
Since this version explicitly calls ValueBoundsInterface, it may be too expensive and is
only activated behind a flag.
It results in better foldings when mixing tiling and padding, including with dynamic shapes.

This should be further composed with #145068 to provide full simplification and address
the remaining TODO in the test.
---
 .../mlir/Dialect/Affine/IR/AffineOps.h        |  18 ++-
 .../mlir/Interfaces/ValueBoundsOpInterface.h  |   2 +-
 mlir/lib/Dialect/Affine/IR/AffineOps.cpp      | 134 ++++++++++++++----
 .../Linalg/Transforms/PadTilingInterface.cpp  |   5 +-
 .../lib/Interfaces/ValueBoundsOpInterface.cpp |   2 +-
 ...m-op-pad-tiling-interface-multiple-of.mlir | 131 +++++++++++++++++
 6 files changed, 251 insertions(+), 41 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
index 6fdb72c370e6d..2091faa6b0b02 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
@@ -410,9 +410,11 @@ void canonicalizeSetAndOperands(IntegerSet *set,
 /// other AffineApplyOps supplying those operands. The operands of the resulting
 /// AffineApplyOp do not change the length of  AffineApplyOp chains.
 AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
-                                      ArrayRef<OpFoldResult> operands);
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin = false);
 AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
-                                      ArrayRef<OpFoldResult> operands);
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin = false);
 
 /// Constructs an AffineApplyOp that applies `map` to `operands` after composing
 /// the map with the maps of any other AffineApplyOp supplying the operands,
@@ -421,16 +423,19 @@ AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
 /// map.
 OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
                                            AffineMap map,
-                                           ArrayRef<OpFoldResult> operands);
+                                           ArrayRef<OpFoldResult> operands,
+                                           bool composeAffineMin = false);
 /// Variant of `makeComposedFoldedAffineApply` that applies to an expression.
 OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
                                            AffineExpr expr,
-                                           ArrayRef<OpFoldResult> operands);
+                                           ArrayRef<OpFoldResult> operands,
+                                           bool composeAffineMin = false);
 /// Variant of `makeComposedFoldedAffineApply` suitable for multi-result maps.
 /// Note that this may create as many affine.apply operations as the map has
 /// results given that affine.apply must be single-result.
 SmallVector<OpFoldResult> makeComposedFoldedMultiResultAffineApply(
-    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands);
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin = false);
 
 /// Returns an AffineMinOp obtained by composing `map` and `operands` with
 /// AffineApplyOps supplying those operands.
@@ -459,7 +464,8 @@ OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc,
 /// terminal symbol, i.e., a symbol defined at the top level or a block/function
 /// argument.
 void fullyComposeAffineMapAndOperands(AffineMap *map,
-                                      SmallVectorImpl<Value> *operands);
+                                      SmallVectorImpl<Value> *operands,
+                                      bool composeAffineMin = false);
 
 } // namespace affine
 } // namespace mlir
diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
index 337314143c80c..523df173093fa 100644
--- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
@@ -135,7 +135,7 @@ class ValueBoundsConstraintSet
 
     /// Construct a variable for a map and its operands.
     Variable(AffineMap map, ArrayRef<Variable> mapOperands);
-    Variable(AffineMap map, ArrayRef<Value> mapOperands);
+    Variable(AffineMap map, ValueRange mapOperands);
 
     MLIRContext *getContext() const { return map.getContext(); }
 
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 3d09c6a9b2c24..06b7910736727 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -11,12 +11,14 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -26,7 +28,9 @@
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
+#include <limits>
 #include <numeric>
 #include <optional>
 
@@ -1042,6 +1046,59 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
                        map.getContext());
 }
 
+/// Assuming `dimOrSym` is a quantity in `map` that is defined by `minOp`,
+/// replaces the patterns:
+/// ```
+///   dimOrSym.ceildiv(cst) * cst
+///   (dimOrSym + cst - 1).floordiv(cst) * cst
+/// ```
+/// by `cst` in `map`.
+/// This simplification is valid iff `minOp` is guaranteed to be nonnegative.
+/// Additionally, allows the caller to pass `affineMinKnownToBeNonNegative` to
+/// inject static information that may not be statically discoverable.
+/// Warning: ValueBoundsConstraintSet::computeConstantBound is needed to check
+/// for the nonnegative case, if `affineMinKnownToBeNonNegative` is false.
+static LogicalResult replaceAffineMinBoundingBoxExpression(
+    AffineMinOp minOp, AffineExpr dimOrSym, AffineMap *map,
+    bool affineMinKnownToBeNonNegative = false) {
+  auto affineMinMap = minOp.getAffineMap();
+  if (!affineMinKnownToBeNonNegative) {
+    ValueRange values = minOp->getOperands();
+    for (unsigned i = 0, e = affineMinMap.getNumResults(); i < e; ++i) {
+      AffineMap row = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
+      FailureOr<int64_t> lowerBound =
+          ValueBoundsConstraintSet::computeConstantBound(
+              presburger::BoundType::LB, {row, values},
+              /*stopCondition=*/nullptr,
+              /*closedUB=*/true);
+      if (failed(lowerBound) || lowerBound.value() < 0)
+        return failure();
+    }
+  }
+
+  AffineMap initialMap = *map;
+  for (unsigned i = 0, e = affineMinMap.getNumResults(); i != e; ++i) {
+    auto m = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
+    // TODO: this should also work with nonnegative symbolic divisors.
+    if (!m.isSingleConstant())
+      continue;
+
+    auto cst = m.getSingleConstantResult();
+    DenseMap<AffineExpr, AffineExpr> repl;
+    // dimOrSym.ceilDiv(cst) * cst -> cst
+    repl[dimOrSym.ceilDiv(cst) * cst] =
+        getAffineConstantExpr(cst, minOp.getContext());
+    // (dimOrSym + cst - 1).floorDiv(cst) * cst -> cst
+    repl[(dimOrSym + cst - 1).floorDiv(cst) * cst] =
+        getAffineConstantExpr(cst, minOp.getContext());
+    auto newMap = map->replace(repl);
+    if (newMap == *map)
+      continue;
+    *map = newMap;
+  }
+  return success(*map != initialMap);
+}
+
 /// Replace all occurrences of AffineExpr at position `pos` in `map` by the
 /// defining AffineApplyOp expression and operands.
 /// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
@@ -1052,10 +1109,13 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
 ///   2. `map` dim and symbols are gradually shifted to higher positions.
 ///   3. Old `dim` and `sym` entries are replaced by nullptr
 /// This avoids the need for any bookkeeping.
+/// If `replaceAffineMin` is set to true, additionally triggers more expensive
+/// replacements involving affine_min operations.
 static LogicalResult replaceDimOrSym(AffineMap *map,
                                      unsigned dimOrSymbolPosition,
                                      SmallVectorImpl<Value> &dims,
-                                     SmallVectorImpl<Value> &syms) {
+                                     SmallVectorImpl<Value> &syms,
+                                     bool replaceAffineMin) {
   MLIRContext *ctx = map->getContext();
   bool isDimReplacement = (dimOrSymbolPosition < dims.size());
   unsigned pos = isDimReplacement ? dimOrSymbolPosition
@@ -1064,6 +1124,13 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
   if (!v)
     return failure();
 
+  auto minOp = v.getDefiningOp<AffineMinOp>();
+  if (minOp && replaceAffineMin) {
+    AffineExpr dimOrSym = isDimReplacement ? getAffineDimExpr(pos, ctx)
+                                           : getAffineSymbolExpr(pos, ctx);
+    return replaceAffineMinBoundingBoxExpression(minOp, dimOrSym, map);
+  }
+
   auto affineApply = v.getDefiningOp<AffineApplyOp>();
   if (!affineApply)
     return failure();
@@ -1101,7 +1168,8 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
 /// iteratively. Perform canonicalization of map and operands as well as
 /// AffineMap simplification. `map` and `operands` are mutated in place.
 static void composeAffineMapAndOperands(AffineMap *map,
-                                        SmallVectorImpl<Value> *operands) {
+                                        SmallVectorImpl<Value> *operands,
+                                        bool composeAffineMin = false) {
   if (map->getNumResults() == 0) {
     canonicalizeMapAndOperands(map, operands);
     *map = simplifyAffineMap(*map);
@@ -1122,7 +1190,8 @@ static void composeAffineMapAndOperands(AffineMap *map,
   while (true) {
     bool changed = false;
     for (unsigned pos = 0; pos != dims.size() + syms.size(); ++pos)
-      if ((changed |= succeeded(replaceDimOrSym(map, pos, dims, syms))))
+      if ((changed |=
+           succeeded(replaceDimOrSym(map, pos, dims, syms, composeAffineMin))))
         break;
     if (!changed)
       break;
@@ -1163,38 +1232,41 @@ static void composeAffineMapAndOperands(AffineMap *map,
 }
 
 void mlir::affine::fullyComposeAffineMapAndOperands(
-    AffineMap *map, SmallVectorImpl<Value> *operands) {
+    AffineMap *map, SmallVectorImpl<Value> *operands, bool composeAffineMin) {
   while (llvm::any_of(*operands, [](Value v) {
     return isa_and_nonnull<AffineApplyOp>(v.getDefiningOp());
   })) {
-    composeAffineMapAndOperands(map, operands);
+    composeAffineMapAndOperands(map, operands, composeAffineMin);
   }
 }
 
 AffineApplyOp
 mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
-                                      ArrayRef<OpFoldResult> operands) {
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin) {
   SmallVector<Value> valueOperands;
   map = foldAttributesIntoMap(b, map, operands, valueOperands);
-  composeAffineMapAndOperands(&map, &valueOperands);
+  composeAffineMapAndOperands(&map, &valueOperands, composeAffineMin);
   assert(map);
   return b.create<AffineApplyOp>(loc, map, valueOperands);
 }
 
 AffineApplyOp
 mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
-                                      ArrayRef<OpFoldResult> operands) {
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin) {
   return makeComposedAffineApply(
       b, loc,
       AffineMap::inferFromExprList(ArrayRef<AffineExpr>{e}, b.getContext())
           .front(),
-      operands);
+      operands, composeAffineMin);
 }
 
 /// Composes the given affine map with the given list of operands, pulling in
 /// the maps from any affine.apply operations that supply the operands.
 static void composeMultiResultAffineMap(AffineMap &map,
-                                        SmallVectorImpl<Value> &operands) {
+                                        SmallVectorImpl<Value> &operands,
+                                        bool composeAffineMin = false) {
   // Compose and canonicalize each expression in the map individually because
   // composition only applies to single-result maps, collecting potentially
   // duplicate operands in a single list with shifted dimensions and symbols.
@@ -1203,7 +1275,8 @@ static void composeMultiResultAffineMap(AffineMap &map,
   for (unsigned i : llvm::seq<unsigned>(0, map.getNumResults())) {
     SmallVector<Value> submapOperands(operands.begin(), operands.end());
     AffineMap submap = map.getSubMap({i});
-    fullyComposeAffineMapAndOperands(&submap, &submapOperands);
+    fullyComposeAffineMapAndOperands(&submap, &submapOperands,
+                                     composeAffineMin);
     canonicalizeMapAndOperands(&submap, &submapOperands);
     unsigned numNewDims = submap.getNumDims();
     submap = submap.shiftDims(dims.size()).shiftSymbols(symbols.size());
@@ -1221,10 +1294,9 @@ static void composeMultiResultAffineMap(AffineMap &map,
   canonicalizeMapAndOperands(&map, &operands);
 }
 
-OpFoldResult
-mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
-                                            AffineMap map,
-                                            ArrayRef<OpFoldResult> operands) {
+OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin) {
   assert(map.getNumResults() == 1 && "building affine.apply with !=1 result");
 
   // Create new builder without a listener, so that no notification is
@@ -1236,7 +1308,7 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
 
   // Create op.
   AffineApplyOp applyOp =
-      makeComposedAffineApply(newBuilder, loc, map, operands);
+      makeComposedAffineApply(newBuilder, loc, map, operands, composeAffineMin);
 
   // Get constant operands.
   SmallVector<Attribute> constOperands(applyOp->getNumOperands());
@@ -1256,26 +1328,25 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
   return llvm::getSingleElement(foldResults);
 }
 
-OpFoldResult
-mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
-                                            AffineExpr expr,
-                                            ArrayRef<OpFoldResult> operands) {
+OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
+    OpBuilder &b, Location loc, AffineExpr expr,
+    ArrayRef<OpFoldResult> operands, bool composeAffineMin) {
   return makeComposedFoldedAffineApply(
       b, loc,
       AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}, b.getContext())
           .front(),
-      operands);
+      operands, composeAffineMin);
 }
 
 SmallVector<OpFoldResult>
 mlir::affine::makeComposedFoldedMultiResultAffineApply(
-    OpBuilder &b, Location loc, AffineMap map,
-    ArrayRef<OpFoldResult> operands) {
-  return llvm::map_to_vector(llvm::seq<unsigned>(0, map.getNumResults()),
-                             [&](unsigned i) {
-                               return makeComposedFoldedAffineApply(
-                                   b, loc, map.getSubMap({i}), operands);
-                             });
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin) {
+  return llvm::map_to_vector(
+      llvm::seq<unsigned>(0, map.getNumResults()), [&](unsigned i) {
+        return makeComposedFoldedAffineApply(b, loc, map.getSubMap({i}),
+                                             operands, composeAffineMin);
+      });
 }
 
 template <typename OpTy>
@@ -3024,7 +3095,8 @@ void AffineIfOp::build(OpBuilder &builder, OperationState &result,
 /// `set` by composing the maps of such affine.apply ops with the integer
 /// set constraints.
 static void composeSetAndOperands(IntegerSet &set,
-                                  SmallVectorImpl<Value> &operands) {
+                                  SmallVectorImpl<Value> &operands,
+                                  bool composeAffineMin) {
   // We will simply reuse the API of the map composition by viewing the LHSs of
   // the equalities and inequalities of `set` as the affine exprs of an affine
   // map. Convert to equivalent map, compose, and convert back to set.
@@ -3035,7 +3107,7 @@ static void composeSetAndOperands(IntegerSet &set,
                     [](Value v) { return v.getDefiningOp<AffineApplyOp>(); }))
     return;
 
-  composeAffineMapAndOperands(&map, &operands);
+  composeAffineMapAndOperands(&map, &operands, composeAffineMin);
   set = IntegerSet::get(map.getNumDims(), map.getNumSymbols(), map.getResults(),
                         set.getEqFlags());
 }
@@ -3044,7 +3116,7 @@ static void composeSetAndOperands(IntegerSet &set,
 LogicalResult AffineIfOp::fold(FoldAdaptor, SmallVectorImpl<OpFoldResult> &) {
   auto set = getIntegerSet();
   SmallVector<Value, 4> operands(getOperands());
-  composeSetAndOperands(set, operands);
+  composeSetAndOperands(set, operands, /*composeAffineMin=*/false);
   canonicalizeSetAndOperands(&set, &operands);
 
   // Check if the canonicalization or composition led to any change.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index 5383ae48aeb3a..42dac0776bace 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -84,7 +84,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
       getDimsToSize(rewriter, indexingSizes, options);
 
   // For each dimension in the operand's shape, iterate over indexingSizes and
-  // add
+  // add the various term contributions.
   for (const auto &enResults : enumerate(indexingMap.getResults())) {
     int64_t resultIndex = enResults.index();
     AffineMap partialIndexingMap = indexingMap.getSubMap(
@@ -122,7 +122,8 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
         AffineMap composedMap = projectedMap.compose(ceilMap);
         OpFoldResult paddingDimOfr = affine::makeComposedFoldedAffineApply(
             rewriter, loc, composedMap,
-            {indexingSizes[paddingDim], paddingSize});
+            {indexingSizes[paddingDim], paddingSize},
+            /*composeAffineMin=*/true);
         terms.push_back(paddingDimOfr);
       } else {
         // Otherwise just set to paddingSize.
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 87f883c2e6485..d858ab3a6406a 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -146,7 +146,7 @@ ValueBoundsConstraintSet::Variable::Variable(AffineMap map,
 }
 
 ValueBoundsConstraintSet::Variable::Variable(AffineMap map,
-                                             ArrayRef<Value> mapOperands)
+                                             ValueRange mapOperands)
     : Variable(map, llvm::map_to_vector(mapOperands,
                                         [](Value v) { return Variable(v); })) {}
 
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
index 5ac35c14be3fb..4fcbcbb2a18e3 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
@@ -136,3 +136,134 @@ module {
     }
   }
 }
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1] -> (-s1 + (s0 ceildiv 16) * 16)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: %[[D0_0:.*]] = tensor.dim
+  //      CHECK: %[[D0_1:.*]] = tensor.dim
+  //      CHECK: %[[H0:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]], %[[D0_1]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
+  //      CHECK:   : tensor<24x?xf32> to tensor<24x?xf32>
+
+  //      CHECK: %[[D0_2:.*]] = tensor.dim
+  //      CHECK: %[[H1:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]], %[[D0_2]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
+  //      CHECK:   : tensor<?x25xf32> to tensor<?x25xf32>
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+
+  // TODO: Not yet simplified enough..
+  //      CHECK:    linalg.matmul ins(%{{.*}}, %{{.*}}: tensor<8x?xf32>, tensor<?x25xf32>) outs(%extracted_slice_5 : tensor<8x25xf32>) -> tensor<8x25xf32>
+  
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
+  // CHECK-SAME:     : tensor<8x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Pad then tile should produce static shapes.
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul_padded tile_sizes [8, 0, 16] 
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0) -> (-d0 + 16)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+  //      CHECK:   %[[MIN:.*]] = affine.min #[[$MAP0]](%{{.*}})
+  //      CHECK:   %[[H0:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
+  //      CHECK:     : tensor<8x?xf32> to tensor<8x16xf32>
+
+  //      CHECK:   %[[H1:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
+  //      CHECK:     : tensor<?x25xf32> to tensor<16x25xf32>
+
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+  
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
+  // CHECK-SAME:     : tensor<8x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile then pad should produce static shapes.
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16] 
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 20, 8)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 8)>
+// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> (-d0 + 16)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<20x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<20x25xf32>)
+     -> tensor<20x25xf32>
+{
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<20x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<20x25xf32>) -> tensor<20x25xf32>
+  func.return %0 : tensor<20x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile then pad should produce static shapes.
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16] 
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+