[Mlir-commits] [mlir] [mlir][linalg] Fix crash when folding tensor.cast into unpack using static packed shape for inner tiles (PR #188000)

Tue Mar 31 20:10:30 PDT 2026

https://github.com/hockyy updated https://github.com/llvm/llvm-project/pull/188000

>From ddf0ddfbcdecec8c66fcbaf9ef3bd2e22b6f948d Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 23 Mar 2026 17:44:17 +0800
Subject: [PATCH 1/6] [mlir][linalg] Bail out tensor.cast pack/unpack fold on
 unprovable tile sizes

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      |  41 +++--
 ...canonicalize-dynamic-pack-unpack-tile.mlir | 149 ++++++++++++++++++
 2 files changed, 176 insertions(+), 14 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index ad2909f656eea..5b75be21e4822 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -5000,8 +5000,10 @@ template SmallVector<int64_t>
 //  * a dim from newPackedTy is static, and
 //  * the corresponding size from mixedTiles is still dynamic.
 // Otherwise, the original tile size is preserved.
+// Returns failure when a dynamic tile cannot be proven to match the static
+// packed dim.
 // Note - packed-type-dim and mixed-tile-size should always match!
-static SmallVector<OpFoldResult>
+static FailureOr<SmallVector<OpFoldResult>>
 getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
                      SmallVector<OpFoldResult> mixedTiles) {
   SmallVector<OpFoldResult> newMixedTileSizes;
@@ -5015,17 +5017,21 @@ getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
       continue;
     }
 
-    // If the current result dim is static, update the dynamic mixed-size
-    // (provided the original value is dynamic).
+    // If the current result dim is static, update the dynamic mixed-size only
+    // when the original dynamic value is a known constant matching `shape`.
+    // Otherwise, bail out and let the fold fail conservatively.
     OpFoldResult tile = std::get<1>(it);
     if (Attribute attr = llvm::dyn_cast_if_present<Attribute>(tile)) {
       // Already a constant
       newMixedTileSizes.push_back(tile);
     } else {
-      assert(getConstantIntValue(tile).value() == shape &&
-             "tile size and dim size don't match!");
-      newMixedTileSizes.push_back(
-          (rewriter.getIntegerAttr(rewriter.getIndexType(), shape)));
+      std::optional<int64_t> constTile = getConstantIntValue(tile);
+      if (constTile.has_value() && constTile.value() == shape) {
+        newMixedTileSizes.push_back(
+            rewriter.getIntegerAttr(rewriter.getIndexType(), shape));
+      } else {
+        return failure();
+      }
     }
   }
 
@@ -5995,8 +6001,11 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
         tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes);
 
     // Get the updated mixed-tile-sizes attribute.
-    SmallVector<OpFoldResult> newMixedTileSizes =
+    FailureOr<SmallVector<OpFoldResult>> newMixedTileSizes =
         getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles());
+    if (failed(newMixedTileSizes))
+      return rewriter.notifyMatchFailure(
+          op, "unable to prove dynamic tile sizes after folding tensor.cast");
 
     // Clone op.
     // TODO: Strictly speaking, discardable attributes should be _discarded_ at
@@ -6004,7 +6013,7 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
     // to preserve. Implement a better abstraction.
     PackOp newOp =
         PackOp::create(rewriter, op.getLoc(), newOperands[0], newOperands[1],
-                       op.getInnerDimsPos(), newMixedTileSizes,
+                       op.getInnerDimsPos(), newMixedTileSizes.value(),
                        op.getPaddingValue(), op.getOuterDimsPerm());
     newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
 
@@ -6476,16 +6485,20 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
     Value sourceTensor = newOperands[0];
 
     // Get the updated mixed-tile-sizes attribute.
-    SmallVector<OpFoldResult> newMixedTileSizes = getNewMixedTileSizes(
-        rewriter, sourceTensor.getType(), op.getMixedTiles());
+    FailureOr<SmallVector<OpFoldResult>> newMixedTileSizes =
+        getNewMixedTileSizes(rewriter, sourceTensor.getType(),
+                             op.getMixedTiles());
+    if (failed(newMixedTileSizes))
+      return rewriter.notifyMatchFailure(
+          op, "unable to prove dynamic tile sizes after folding tensor.cast");
 
     // Clone op.
     // TODO: Strictly speaking, discardable attributes should be _discarded_ at
     // this point. However, in practice, we use them for things that we'd like
     // to preserve. Implement a better abstraction.
-    UnPackOp newOp = UnPackOp::create(rewriter, op.getLoc(), sourceTensor,
-                                      newOperands[1], op.getInnerDimsPos(),
-                                      newMixedTileSizes, op.getOuterDimsPerm());
+    UnPackOp newOp = UnPackOp::create(
+        rewriter, op.getLoc(), sourceTensor, newOperands[1],
+        op.getInnerDimsPos(), newMixedTileSizes.value(), op.getOuterDimsPerm());
     newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
 
     // Replace op.
diff --git a/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir b/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir
new file mode 100644
index 0000000000000..eec3e3acc93fb
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir
@@ -0,0 +1,149 @@
+// RUN: mlir-opt %s --inline -canonicalize="test-convergence" -split-input-file | FileCheck %s --check-prefixes=CHECK
+
+// CHECK: func.func @dynamic_tile_arg_no_fold
+// CHECK-SAME:  %[[SRC:.+]]: tensor<1x3x8x1xi32>, %[[TILE:.+]]: index
+// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %[[SRC]] : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+// CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
+// CHECK-SAME:    inner_dims_pos = [0, 1]
+// CHECK-SAME:    inner_tiles = [%[[TILE]], 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+module {
+  func.func @dynamic_tile_arg_no_fold(%arg0: tensor<1x3x8x1xi32>, %arg1: index) -> tensor<7x3xi32> {
+    %0 = tensor.empty() : tensor<7x3xi32>
+    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%arg1, 1] into %0 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+    return %unpack : tensor<7x3xi32>
+  }
+}
+
+
+// -----
+
+// CHECK-LABEL: func.func @dynamic_tile_from_inlined_mismatch_no_fold
+// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
+// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %{{.+}} : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+// CHECK:       %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
+// CHECK-SAME:    inner_dims_pos = [0, 1]
+// CHECK-SAME:    inner_tiles = [%[[C256]], 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+module {
+  func.func @get_tile() -> index {
+    %c256 = arith.constant 256 : index
+    return %c256 : index
+  }
+  func.func @dynamic_tile_from_inlined_mismatch_no_fold(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
+    %0 = call @get_tile() : () -> index
+    %1 = tensor.empty() : tensor<7x3xi32>
+    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%0, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+    return %unpack : tensor<7x3xi32>
+  }
+}
+
+
+// -----
+
+// CHECK-LABEL: func.func @constant_tile_from_inlined_match_folds
+// CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-NOT:   tensor.cast
+// CHECK:       %[[UNPACK:.+]] = linalg.unpack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<1x3x8x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+module {
+  func.func @get_tile() -> index {
+    %c8 = arith.constant 8 : index
+    return %c8 : index
+  }
+  func.func @constant_tile_from_inlined_match_folds(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
+    %0 = call @get_tile() : () -> index
+    %1 = tensor.empty() : tensor<7x3xi32>
+    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%0, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+    return %unpack : tensor<7x3xi32>
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pack_dynamic_tile_arg
+// CHECK-SAME:  %[[SRC:.+]]: tensor<8x3xi32>, %[[TILE:.+]]: index, %[[DEST:.+]]: tensor<?x3x?x1xi32>
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%[[TILE]], 1]
+// CHECK:         into %[[DEST]] : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+module {
+  func.func @pack_dynamic_tile_arg(%arg0: tensor<8x3xi32>, %arg1: index,
+      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+    %c0 = arith.constant 0 : i32
+    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+    %pack = linalg.pack %cast
+      padding_value(%c0 : i32)
+      inner_dims_pos = [0, 1]
+      inner_tiles = [%arg1, 1]
+      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+    return %pack : tensor<?x3x?x1xi32>
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pack_dynamic_tile_from_inlined_mismatch
+// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%[[C256]], 1]
+// CHECK:         into %{{.+}} : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+module {
+  func.func @pack_get_tile() -> index {
+    %c256 = arith.constant 256 : index
+    return %c256 : index
+  }
+  func.func @pack_dynamic_tile_from_inlined_mismatch(%arg0: tensor<8x3xi32>,
+      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+    %c0 = arith.constant 0 : i32
+    %0 = call @pack_get_tile() : () -> index
+    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+    %pack = linalg.pack %cast
+      padding_value(%c0 : i32)
+      inner_dims_pos = [0, 1]
+      inner_tiles = [%0, 1]
+      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+    return %pack : tensor<?x3x?x1xi32>
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pack_dynamic_tile_from_inlined_match_fold
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%{{.+}}, 1]
+// CHECK:         into %{{.+}} : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+module {
+  func.func @pack_get_tile() -> index {
+    %c8 = arith.constant 8 : index
+    return %c8 : index
+  }
+  func.func @pack_dynamic_tile_from_inlined_match_fold(%arg0: tensor<8x3xi32>,
+      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+    %c0 = arith.constant 0 : i32
+    %0 = call @pack_get_tile() : () -> index
+    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+    %pack = linalg.pack %cast
+      padding_value(%c0 : i32)
+      inner_dims_pos = [0, 1]
+      inner_tiles = [%0, 1]
+      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+    return %pack : tensor<?x3x?x1xi32>
+  }
+}
\ No newline at end of file

>From 88092ddeba16130b20d2f3e87e602153e204af1f Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 23 Mar 2026 18:11:14 +0800
Subject: [PATCH 2/6] Update mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp

Co-authored-by: Renato Golin <rengolin at systemcall.eu>
---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 5b75be21e4822..effc0e3232b4f 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -6001,7 +6001,7 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
         tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes);
 
     // Get the updated mixed-tile-sizes attribute.
-    FailureOr<SmallVector<OpFoldResult>> newMixedTileSizes =
+    auto newMixedTileSizes =
         getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles());
     if (failed(newMixedTileSizes))
       return rewriter.notifyMatchFailure(

>From c27d25d41ca28cb34bd41126bb950c4056eeb947 Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 23 Mar 2026 18:11:27 +0800
Subject: [PATCH 3/6] Update mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp

Co-authored-by: Renato Golin <rengolin at systemcall.eu>
---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index effc0e3232b4f..918ba1284e043 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -6485,7 +6485,7 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
     Value sourceTensor = newOperands[0];
 
     // Get the updated mixed-tile-sizes attribute.
-    FailureOr<SmallVector<OpFoldResult>> newMixedTileSizes =
+    auto newMixedTileSizes =
         getNewMixedTileSizes(rewriter, sourceTensor.getType(),
                              op.getMixedTiles());
     if (failed(newMixedTileSizes))

>From d5c9b284eb84541edb5f631ab0d4f335a1ae8787 Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Mon, 23 Mar 2026 18:14:29 +0800
Subject: [PATCH 4/6] [mlir][linalg] Fix refactoring reviews for
 getNewMixedTileSizes

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 918ba1284e043..9d2afdbb8e6bf 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -5021,17 +5021,11 @@ getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
     // when the original dynamic value is a known constant matching `shape`.
     // Otherwise, bail out and let the fold fail conservatively.
     OpFoldResult tile = std::get<1>(it);
-    if (Attribute attr = llvm::dyn_cast_if_present<Attribute>(tile)) {
-      // Already a constant
-      newMixedTileSizes.push_back(tile);
+    std::optional<int64_t> constTile = getConstantIntValue(tile);
+    if (constTile.has_value() && constTile.value() == shape) {
+      newMixedTileSizes.push_back(rewriter.getIndexAttr(shape));
     } else {
-      std::optional<int64_t> constTile = getConstantIntValue(tile);
-      if (constTile.has_value() && constTile.value() == shape) {
-        newMixedTileSizes.push_back(
-            rewriter.getIntegerAttr(rewriter.getIndexType(), shape));
-      } else {
-        return failure();
-      }
+      return failure();
     }
   }
 
@@ -6485,9 +6479,8 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
     Value sourceTensor = newOperands[0];
 
     // Get the updated mixed-tile-sizes attribute.
-    auto newMixedTileSizes =
-        getNewMixedTileSizes(rewriter, sourceTensor.getType(),
-                             op.getMixedTiles());
+    auto newMixedTileSizes = getNewMixedTileSizes(
+        rewriter, sourceTensor.getType(), op.getMixedTiles());
     if (failed(newMixedTileSizes))
       return rewriter.notifyMatchFailure(
           op, "unable to prove dynamic tile sizes after folding tensor.cast");

>From cc2065c38e3431f96eb8c83d6c62fb8b3ee025d7 Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Thu, 26 Mar 2026 09:54:13 +0800
Subject: [PATCH 5/6] [mlir][linalg] Refactor test cases

---
 ...canonicalize-dynamic-pack-unpack-tile.mlir | 149 ------------------
 mlir/test/Dialect/Linalg/canonicalize.mlir    | 118 ++++++++++++++
 2 files changed, 118 insertions(+), 149 deletions(-)
 delete mode 100644 mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir

diff --git a/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir b/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir
deleted file mode 100644
index eec3e3acc93fb..0000000000000
--- a/mlir/test/Dialect/Linalg/canonicalize-dynamic-pack-unpack-tile.mlir
+++ /dev/null
@@ -1,149 +0,0 @@
-// RUN: mlir-opt %s --inline -canonicalize="test-convergence" -split-input-file | FileCheck %s --check-prefixes=CHECK
-
-// CHECK: func.func @dynamic_tile_arg_no_fold
-// CHECK-SAME:  %[[SRC:.+]]: tensor<1x3x8x1xi32>, %[[TILE:.+]]: index
-// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
-// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %[[SRC]] : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-// CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
-// CHECK-SAME:    inner_dims_pos = [0, 1]
-// CHECK-SAME:    inner_tiles = [%[[TILE]], 1]
-// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-module {
-  func.func @dynamic_tile_arg_no_fold(%arg0: tensor<1x3x8x1xi32>, %arg1: index) -> tensor<7x3xi32> {
-    %0 = tensor.empty() : tensor<7x3xi32>
-    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%arg1, 1] into %0 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-    return %unpack : tensor<7x3xi32>
-  }
-}
-
-
-// -----
-
-// CHECK-LABEL: func.func @dynamic_tile_from_inlined_mismatch_no_fold
-// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
-// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
-// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %{{.+}} : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-// CHECK:       %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
-// CHECK-SAME:    inner_dims_pos = [0, 1]
-// CHECK-SAME:    inner_tiles = [%[[C256]], 1]
-// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-module {
-  func.func @get_tile() -> index {
-    %c256 = arith.constant 256 : index
-    return %c256 : index
-  }
-  func.func @dynamic_tile_from_inlined_mismatch_no_fold(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
-    %0 = call @get_tile() : () -> index
-    %1 = tensor.empty() : tensor<7x3xi32>
-    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%0, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-    return %unpack : tensor<7x3xi32>
-  }
-}
-
-
-// -----
-
-// CHECK-LABEL: func.func @constant_tile_from_inlined_match_folds
-// CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
-// CHECK-NOT:   tensor.cast
-// CHECK:       %[[UNPACK:.+]] = linalg.unpack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 1]
-// CHECK-SAME:    into %[[EMPTY]] : tensor<1x3x8x1xi32> -> tensor<7x3xi32>
-// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-module {
-  func.func @get_tile() -> index {
-    %c8 = arith.constant 8 : index
-    return %c8 : index
-  }
-  func.func @constant_tile_from_inlined_match_folds(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
-    %0 = call @get_tile() : () -> index
-    %1 = tensor.empty() : tensor<7x3xi32>
-    %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-    %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%0, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-    return %unpack : tensor<7x3xi32>
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func.func @pack_dynamic_tile_arg
-// CHECK-SAME:  %[[SRC:.+]]: tensor<8x3xi32>, %[[TILE:.+]]: index, %[[DEST:.+]]: tensor<?x3x?x1xi32>
-// CHECK:       %[[PACK:.+]] = linalg.pack
-// CHECK:         padding_value
-// CHECK:         inner_dims_pos = [0, 1]
-// CHECK:         inner_tiles = [%[[TILE]], 1]
-// CHECK:         into %[[DEST]] : tensor
-// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
-module {
-  func.func @pack_dynamic_tile_arg(%arg0: tensor<8x3xi32>, %arg1: index,
-      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
-    %c0 = arith.constant 0 : i32
-    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
-    %pack = linalg.pack %cast
-      padding_value(%c0 : i32)
-      inner_dims_pos = [0, 1]
-      inner_tiles = [%arg1, 1]
-      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
-    return %pack : tensor<?x3x?x1xi32>
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func.func @pack_dynamic_tile_from_inlined_mismatch
-// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
-// CHECK:       %[[PACK:.+]] = linalg.pack
-// CHECK:         padding_value
-// CHECK:         inner_dims_pos = [0, 1]
-// CHECK:         inner_tiles = [%[[C256]], 1]
-// CHECK:         into %{{.+}} : tensor
-// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
-module {
-  func.func @pack_get_tile() -> index {
-    %c256 = arith.constant 256 : index
-    return %c256 : index
-  }
-  func.func @pack_dynamic_tile_from_inlined_mismatch(%arg0: tensor<8x3xi32>,
-      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
-    %c0 = arith.constant 0 : i32
-    %0 = call @pack_get_tile() : () -> index
-    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
-    %pack = linalg.pack %cast
-      padding_value(%c0 : i32)
-      inner_dims_pos = [0, 1]
-      inner_tiles = [%0, 1]
-      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
-    return %pack : tensor<?x3x?x1xi32>
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func.func @pack_dynamic_tile_from_inlined_match_fold
-// CHECK:       %[[PACK:.+]] = linalg.pack
-// CHECK:         padding_value
-// CHECK:         inner_dims_pos = [0, 1]
-// CHECK:         inner_tiles = [%{{.+}}, 1]
-// CHECK:         into %{{.+}} : tensor
-// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
-module {
-  func.func @pack_get_tile() -> index {
-    %c8 = arith.constant 8 : index
-    return %c8 : index
-  }
-  func.func @pack_dynamic_tile_from_inlined_match_fold(%arg0: tensor<8x3xi32>,
-      %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
-    %c0 = arith.constant 0 : i32
-    %0 = call @pack_get_tile() : () -> index
-    %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
-    %pack = linalg.pack %cast
-      padding_value(%c0 : i32)
-      inner_dims_pos = [0, 1]
-      inner_tiles = [%0, 1]
-      into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
-    return %pack : tensor<?x3x?x1xi32>
-  }
-}
\ No newline at end of file
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 77c1c3da17166..f0949fb6b1839 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -2160,3 +2160,121 @@ func.func @unpack_pack_memref_no_canonicalization(%packed: memref<16x8x8x32xf32>
   linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %dest : memref<128x256xf32> -> memref<16x8x8x32xf32>
   return
 }
+
+// -----
+// CHECK-LABEL: func.func @no_fold_unpack_cast_inner_tile_dynamic_arg
+// CHECK-SAME:  %[[SRC:.+]]: tensor<1x3x8x1xi32>, %[[TILE:.+]]: index
+// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %[[SRC]] : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+// CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
+// CHECK-SAME:    inner_dims_pos = [0, 1]
+// CHECK-SAME:    inner_tiles = [%[[TILE]], 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+func.func @no_fold_unpack_cast_inner_tile_dynamic_arg(%arg0: tensor<1x3x8x1xi32>, %arg1: index) -> tensor<7x3xi32> {
+  %0 = tensor.empty() : tensor<7x3xi32>
+  %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%arg1, 1] into %0 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+  return %unpack : tensor<7x3xi32>
+}
+
+
+// -----
+// CHECK-LABEL: func.func @no_fold_unpack_cast_inner_tile_inlined_mismatch
+// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
+// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %{{.+}} : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+// CHECK:       %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
+// CHECK-SAME:    inner_dims_pos = [0, 1]
+// CHECK-SAME:    inner_tiles = [%[[C256]], 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+func.func @no_fold_unpack_cast_inner_tile_inlined_mismatch(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
+  %c256 = arith.constant 256 : index
+  %1 = tensor.empty() : tensor<7x3xi32>
+  %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c256, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+  return %unpack : tensor<7x3xi32>
+}
+// -----
+
+// CHECK-LABEL: func.func @unpack_cast_inner_tile_inlined_match_fold
+// CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK-NOT:   tensor.cast
+// CHECK:       %[[UNPACK:.+]] = linalg.unpack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<1x3x8x1xi32> -> tensor<7x3xi32>
+// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
+func.func @unpack_cast_inner_tile_inlined_match_fold(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
+  %c8 = arith.constant 8 : index
+  %1 = tensor.empty() : tensor<7x3xi32>
+  %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
+  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+  return %unpack : tensor<7x3xi32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @no_fold_pack_cast_inner_tile_dynamic_arg
+// CHECK-SAME:  %[[SRC:.+]]: tensor<8x3xi32>, %[[TILE:.+]]: index, %[[DEST:.+]]: tensor<?x3x?x1xi32>
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%[[TILE]], 1]
+// CHECK:         into %[[DEST]] : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+func.func @no_fold_pack_cast_inner_tile_dynamic_arg(%arg0: tensor<8x3xi32>, %arg1: index,
+    %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+  %c0 = arith.constant 0 : i32
+  %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+  %pack = linalg.pack %cast
+    padding_value(%c0 : i32)
+    inner_dims_pos = [0, 1]
+    inner_tiles = [%arg1, 1]
+    into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+  return %pack : tensor<?x3x?x1xi32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @no_fold_pack_cast_inner_tile_inlined_mismatch
+// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%[[C256]], 1]
+// CHECK:         into %{{.+}} : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+func.func @no_fold_pack_cast_inner_tile_inlined_mismatch(%arg0: tensor<8x3xi32>,
+    %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+  %c0 = arith.constant 0 : i32
+  %c256 = arith.constant 256 : index
+  %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+  %pack = linalg.pack %cast
+    padding_value(%c0 : i32)
+    inner_dims_pos = [0, 1]
+    inner_tiles = [%c256, 1]
+    into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+  return %pack : tensor<?x3x?x1xi32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pack_cast_inner_tile_inlined_match_fold
+// CHECK:       %[[PACK:.+]] = linalg.pack
+// CHECK:         padding_value
+// CHECK:         inner_dims_pos = [0, 1]
+// CHECK:         inner_tiles = [%{{.+}}, 1]
+// CHECK:         into %{{.+}} : tensor
+// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
+func.func @pack_cast_inner_tile_inlined_match_fold(%arg0: tensor<8x3xi32>,
+    %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
+  %c0 = arith.constant 0 : i32
+  %c8 = arith.constant 8 : index
+  %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
+  %pack = linalg.pack %cast
+    padding_value(%c0 : i32)
+    inner_dims_pos = [0, 1]
+    inner_tiles = [%c8, 1]
+    into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
+  return %pack : tensor<?x3x?x1xi32>
+}

>From 4fdd31f8b140efbd789989460583812894fa044e Mon Sep 17 00:00:00 2001
From: Hocky Yudhiono <hocky.yudhiono at gmail.com>
Date: Thu, 26 Mar 2026 10:19:56 +0800
Subject: [PATCH 6/6] [mlir][linalg] Fix testcases

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp   | 41 ++++----------
 mlir/test/Dialect/Linalg/canonicalize.mlir | 63 +++++-----------------
 2 files changed, 23 insertions(+), 81 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 9d2afdbb8e6bf..cbf1fa5a32502 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -4995,17 +4995,14 @@ template SmallVector<int64_t>
     getPackedOuterShapeWithoutTransposition<UnPackOp>(UnPackOp);
 
 // Given the (potentially) updated packed type, `newPackedTy`, generates an
-// updated mixed-tile-sizes attribute. A tile size is updated only
-// when:
-//  * a dim from newPackedTy is static, and
-//  * the corresponding size from mixedTiles is still dynamic.
-// Otherwise, the original tile size is preserved.
-// Returns failure when a dynamic tile cannot be proven to match the static
-// packed dim.
+// updated mixed-tile-sizes list. For each inner packed dimension that is static
+// in `newPackedTy`, the tile is overwritten with that static size; the old
+// tile (SSA value or constant) is not checked, as the folded type is treated
+// as authoritative. Dynamic packed dimensions keep their original tile.
 // Note - packed-type-dim and mixed-tile-size should always match!
-static FailureOr<SmallVector<OpFoldResult>>
+static SmallVector<OpFoldResult>
 getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
-                     SmallVector<OpFoldResult> mixedTiles) {
+                     ArrayRef<OpFoldResult> mixedTiles) {
   SmallVector<OpFoldResult> newMixedTileSizes;
   for (auto it : llvm::zip(cast<ShapedType>(newPackedTy)
                                .getShape()
@@ -5016,17 +5013,7 @@ getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
       newMixedTileSizes.push_back(std::get<1>(it));
       continue;
     }
-
-    // If the current result dim is static, update the dynamic mixed-size only
-    // when the original dynamic value is a known constant matching `shape`.
-    // Otherwise, bail out and let the fold fail conservatively.
-    OpFoldResult tile = std::get<1>(it);
-    std::optional<int64_t> constTile = getConstantIntValue(tile);
-    if (constTile.has_value() && constTile.value() == shape) {
-      newMixedTileSizes.push_back(rewriter.getIndexAttr(shape));
-    } else {
-      return failure();
-    }
+    newMixedTileSizes.push_back(rewriter.getIndexAttr(shape));
   }
 
   return newMixedTileSizes;
@@ -5995,11 +5982,8 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
         tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes);
 
     // Get the updated mixed-tile-sizes attribute.
-    auto newMixedTileSizes =
+    SmallVector<OpFoldResult> newMixedTileSizes =
         getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles());
-    if (failed(newMixedTileSizes))
-      return rewriter.notifyMatchFailure(
-          op, "unable to prove dynamic tile sizes after folding tensor.cast");
 
     // Clone op.
     // TODO: Strictly speaking, discardable attributes should be _discarded_ at
@@ -6007,7 +5991,7 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
     // to preserve. Implement a better abstraction.
     PackOp newOp =
         PackOp::create(rewriter, op.getLoc(), newOperands[0], newOperands[1],
-                       op.getInnerDimsPos(), newMixedTileSizes.value(),
+                       op.getInnerDimsPos(), newMixedTileSizes,
                        op.getPaddingValue(), op.getOuterDimsPerm());
     newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
 
@@ -6479,11 +6463,8 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
     Value sourceTensor = newOperands[0];
 
     // Get the updated mixed-tile-sizes attribute.
-    auto newMixedTileSizes = getNewMixedTileSizes(
+    SmallVector<OpFoldResult> newMixedTileSizes = getNewMixedTileSizes(
         rewriter, sourceTensor.getType(), op.getMixedTiles());
-    if (failed(newMixedTileSizes))
-      return rewriter.notifyMatchFailure(
-          op, "unable to prove dynamic tile sizes after folding tensor.cast");
 
     // Clone op.
     // TODO: Strictly speaking, discardable attributes should be _discarded_ at
@@ -6491,7 +6472,7 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
     // to preserve. Implement a better abstraction.
     UnPackOp newOp = UnPackOp::create(
         rewriter, op.getLoc(), sourceTensor, newOperands[1],
-        op.getInnerDimsPos(), newMixedTileSizes.value(), op.getOuterDimsPerm());
+        op.getInnerDimsPos(), newMixedTileSizes, op.getOuterDimsPerm());
     newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
 
     // Replace op.
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index f0949fb6b1839..285bdc21fbd1a 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -2162,16 +2162,15 @@ func.func @unpack_pack_memref_no_canonicalization(%packed: memref<16x8x8x32xf32>
 }
 
 // -----
-// CHECK-LABEL: func.func @no_fold_unpack_cast_inner_tile_dynamic_arg
+// CHECK-LABEL: func.func @fold_unpack_cast_inner_tile_dynamic_arg
 // CHECK-SAME:  %[[SRC:.+]]: tensor<1x3x8x1xi32>, %[[TILE:.+]]: index
-// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
-// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %[[SRC]] : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-// CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
+// CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
+// CHECK:       %[[UNPACK:.+]] = linalg.unpack %[[SRC]]
 // CHECK-SAME:    inner_dims_pos = [0, 1]
-// CHECK-SAME:    inner_tiles = [%[[TILE]], 1]
-// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+// CHECK-SAME:    inner_tiles = [8, 1]
+// CHECK-SAME:    into %[[EMPTY]] : tensor<1x3x8x1xi32> -> tensor<7x3xi32>
 // CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-func.func @no_fold_unpack_cast_inner_tile_dynamic_arg(%arg0: tensor<1x3x8x1xi32>, %arg1: index) -> tensor<7x3xi32> {
+func.func @fold_unpack_cast_inner_tile_dynamic_arg(%arg0: tensor<1x3x8x1xi32>, %arg1: index) -> tensor<7x3xi32> {
   %0 = tensor.empty() : tensor<7x3xi32>
   %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
   %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%arg1, 1] into %0 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
@@ -2180,35 +2179,19 @@ func.func @no_fold_unpack_cast_inner_tile_dynamic_arg(%arg0: tensor<1x3x8x1xi32>
 
 
 // -----
-// CHECK-LABEL: func.func @no_fold_unpack_cast_inner_tile_inlined_mismatch
-// CHECK-DAG:   %[[C256:.+]] = arith.constant 256 : index
-// CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
-// CHECK-DAG:   %[[CAST:.+]] = tensor.cast %{{.+}} : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-// CHECK:       %[[UNPACK:.+]] = linalg.unpack %[[CAST]]
-// CHECK-SAME:    inner_dims_pos = [0, 1]
-// CHECK-SAME:    inner_tiles = [%[[C256]], 1]
-// CHECK-SAME:    into %[[EMPTY]] : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-// CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-func.func @no_fold_unpack_cast_inner_tile_inlined_mismatch(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
-  %c256 = arith.constant 256 : index
-  %1 = tensor.empty() : tensor<7x3xi32>
-  %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c256, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
-  return %unpack : tensor<7x3xi32>
-}
-// -----
-
-// CHECK-LABEL: func.func @unpack_cast_inner_tile_inlined_match_fold
+// Mismatched constant tile vs static packed shape: fold still drops the cast and
+// takes inner tile sizes from the refined packed type.
+// CHECK-LABEL: func.func @fold_unpack_cast_inner_tile_inlined_mismatch
 // CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<7x3xi32>
 // CHECK-NOT:   tensor.cast
 // CHECK:       %[[UNPACK:.+]] = linalg.unpack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 1]
 // CHECK-SAME:    into %[[EMPTY]] : tensor<1x3x8x1xi32> -> tensor<7x3xi32>
 // CHECK:       return %[[UNPACK]] : tensor<7x3xi32>
-func.func @unpack_cast_inner_tile_inlined_match_fold(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
-  %c8 = arith.constant 8 : index
+func.func @fold_unpack_cast_inner_tile_inlined_mismatch(%arg0: tensor<1x3x8x1xi32>) -> tensor<7x3xi32> {
+  %c256 = arith.constant 256 : index
   %1 = tensor.empty() : tensor<7x3xi32>
   %cast = tensor.cast %arg0 : tensor<1x3x8x1xi32> to tensor<?x3x?x1xi32>
-  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
+  %unpack = linalg.unpack %cast inner_dims_pos = [0, 1] inner_tiles = [%c256, 1] into %1 : tensor<?x3x?x1xi32> -> tensor<7x3xi32>
   return %unpack : tensor<7x3xi32>
 }
 
@@ -2256,25 +2239,3 @@ func.func @no_fold_pack_cast_inner_tile_inlined_mismatch(%arg0: tensor<8x3xi32>,
     into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
   return %pack : tensor<?x3x?x1xi32>
 }
-
-// -----
-
-// CHECK-LABEL: func.func @pack_cast_inner_tile_inlined_match_fold
-// CHECK:       %[[PACK:.+]] = linalg.pack
-// CHECK:         padding_value
-// CHECK:         inner_dims_pos = [0, 1]
-// CHECK:         inner_tiles = [%{{.+}}, 1]
-// CHECK:         into %{{.+}} : tensor
-// CHECK:       return %[[PACK]] : tensor<?x3x?x1xi32>
-func.func @pack_cast_inner_tile_inlined_match_fold(%arg0: tensor<8x3xi32>,
-    %dest: tensor<?x3x?x1xi32>) -> tensor<?x3x?x1xi32> {
-  %c0 = arith.constant 0 : i32
-  %c8 = arith.constant 8 : index
-  %cast = tensor.cast %arg0 : tensor<8x3xi32> to tensor<?x?xi32>
-  %pack = linalg.pack %cast
-    padding_value(%c0 : i32)
-    inner_dims_pos = [0, 1]
-    inner_tiles = [%c8, 1]
-    into %dest : tensor<?x?xi32> -> tensor<?x3x?x1xi32>
-  return %pack : tensor<?x3x?x1xi32>
-}