[Mlir-commits] [mlir] [mlir][linalg] Enable scalable vectorization of linalg.unpack (PR #149293)

Fri Aug 1 06:49:16 PDT 2025

https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/149293

>From 1992e9ae230e32d4c981676e1237d836b57abafe Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 16 Jul 2025 17:08:55 +0000
Subject: [PATCH 01/12] [mlir][linalg] Enable scalable vectorization of
 linalg.unpack (WIP)

This patch updates `vectorizeAsTensorUnpackOp` to support scalable
vectorization by requiring user-specified vector sizes for both the
_read_ and _write_ operations involved in `linalg.unpack`. Detailed
rationale and an example are provided below.

Conceptually, `linalg.unpack` consists of the following high-level steps:
  1. _Read_ from the source tensor.
  2. Transpose the value read in step (1).
  3. _Write_ the value from step (2) into the destination tensor.

Currently, when vectorizing with user-provided vector sizes, only the
sizes for the _write_ operation (step 3) are required. Sizes for the
_read_ operation (step 1) are inferred from static shapes and inner tile
sizes.

This logic breaks when the input shapes or tile sizes are dynamic
(indeed, `vectorizeUnPackOpPrecondition` rejects such cases ATM and the
vectorization fails). This patch addresses the issue by requiring
explicit vector sizes for both the read and write sides, enabling
scalable vectorization in such cases.

Example:

```mlir
func.func @unpack(%in: tensor<1x1x8x?xf32>, %out: tensor<8x?xf32>) -> tensor<8x?xf32> {
  %vs = vector.vscale
  %c8 = arith.constant 8 : index
  %tile_size = arith.muli %vs, %c8 : index

  %unpack = linalg.unpack  %in
    inner_dims_pos = [0, 1]
    inner_tiles = [8, %tile_size]
    into %out : tensor<1x1x8x?xf32> -> tensor<8x?xf32>
  return %unpack : tensor<8x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [1, 1, 8, [8],  8, [8]] : !transform.any_op
    //                                              \         /    \    /
    //                                              read-sizes   write-sizes
    transform.yield
  }
}
```

Finally, this patch also extends `createReadOrMaskedRead` and
`createWriteOrMaskedWrite` to take scalable flags.
---
 .../mlir/Dialect/Vector/Utils/VectorUtils.h   |   2 +-
 .../Linalg/Transforms/Vectorization.cpp       | 113 +++++++++++++-----
 mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp |  22 ++--
 .../Linalg/vectorization/linalg-ops.mlir      |  98 +++++++++++++--
 4 files changed, 182 insertions(+), 53 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 7cd70e42d363c..8bd54cf31b893 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -228,7 +228,7 @@ bool isLinearizableVector(VectorType type);
 Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source,
                              ArrayRef<int64_t> inputVectorSizes, Value padValue,
                              bool useInBoundsInsteadOfMasking = false,
-                             ArrayRef<bool> scalableDims = {});
+                             ArrayRef<bool> inputScalableVecDims = {});
 
 /// Returns success if `inputVectorSizes` is a valid masking configuraion for
 /// given `shape`, i.e., it meets:
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 0860ceafa0270..17e0a728e7d03 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1805,7 +1805,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
     inputShape[innerDimsPos[idx]] *= size;
   auto maskedRead = vector::createReadOrMaskedRead(
       rewriter, loc, packOp.getSource(), inputShape, padValue,
-      useInBoundsInsteadOfMasking);
+      useInBoundsInsteadOfMasking,
+      /*inputScalableVecSizes=*/{});
 
   // Create ShapeCastOp.
   SmallVector<int64_t> destShape(inputVectorSizes);
@@ -1885,11 +1886,19 @@ static VectorType getCollapsedVecType(VectorType type,
 ///   vector::TransferWriteOp. - Write the result vector back to the destination
 ///   tensor.
 ///   If the vector sizes are not provided:
-///   * the vector sizes are determined by the input operand and attributes,
-///   * update the inBounds attribute instead of masking.
+/// Vectorize `linalg.unpack %src into %dest` as:
+///   // Reads a vector from the source tensor
+///   %read = vector.transfer_read  %src
+///   // Transpose %read as specified in `outer_dims_perm` attribute
+///   %tr = vector.transpose %read
+///   // Reshape the data based on the target
+///   %sc = vector.shape_cast %tr
+///   // Write the result vector to the destination tensor.
+///   vector.transfer_write %sc into %dest
 static LogicalResult
 vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
+                          ArrayRef<bool> inputScalableVecDims,
                           SmallVectorImpl<Value> &newResults) {
 
   // TODO: Introduce a parent class that will handle the insertion point update.
@@ -1906,25 +1915,54 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   auto destSize = unpackOp.getDestRank();
 
-  if (!inputVectorSizes.empty())
-    assert(inputVectorSizes.size() == destSize &&
+  if (!inputVectorSizes.empty()) {
+    assert(inputVectorSizes.size() == destSize + sourceShape.size() &&
            "Incorrect number of input vector sizes");
+  }
+
+  SmallVector<bool> readScalableVectorFlags;
+  SmallVector<bool> writeScalableVectorFlags;
+  SmallVector<int64_t> readVectorSizes;
+  SmallVector<int64_t> writeVectorSizes;
+
+  // Split input-vector-sizes into vector sizes for the read and write
+  // operations.
+  if (!inputVectorSizes.empty()) {
+    readVectorSizes.append(inputVectorSizes.begin(),
+                           inputVectorSizes.begin() + sourceShape.size());
+    writeVectorSizes.append(inputVectorSizes.begin() + sourceShape.size(),
+                            inputVectorSizes.end());
+  }
+  if (!inputScalableVecDims.empty()) {
+    readScalableVectorFlags.append(inputScalableVecDims.begin(),
+                                   inputScalableVecDims.begin() +
+                                       sourceShape.size());
+    writeScalableVectorFlags.append(inputScalableVecDims.begin() +
+                                        sourceShape.size(),
+                                    inputScalableVecDims.end());
+  } else {
+    readScalableVectorFlags = SmallVector<bool>(sourceShape.size(), false);
+    writeScalableVectorFlags = SmallVector<bool>(destSize, false);
+  }
 
-  // vectorSizes is the shape of the vector that will be used to do final
+  // writeVectorSizes is the shape of the vector that will be used to do final
   // write on the destination tensor. It is set like this: Let's say the
   // source tensor is rank 'M' and the dest tensor rank 'N', where N <= M.
   // Thus:
-  // 1. vectorSizes = sourceShape.take_front(N)
-  // 2. if outer_dims_perms is present: do that permutation on vectorSizes.
+  // 1. writeVectorSizes = sourceShape.take_front(N)
+  // 2. if outer_dims_perms is present: do that permutation on writeVectorSizes.
   // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
   //    innerTiles attribute value.
-  SmallVector<int64_t> vectorSizes(inputVectorSizes);
-  if (vectorSizes.empty()) {
-    llvm::append_range(vectorSizes, sourceShape.take_front(destSize));
+  // SmallVector<int64_t> writeVectorSizes(inputVectorSizes);
+  if (writeVectorSizes.empty()) {
+    if (ShapedType::isDynamicShape(sourceShape))
+      return failure();
+
+    llvm::append_range(writeVectorSizes, sourceShape.take_front(destSize));
     if (!outerDimsPerm.empty())
-      applyPermutationToVector(vectorSizes, outerDimsPerm);
+      applyPermutationToVector(writeVectorSizes, outerDimsPerm);
     for (auto [i, pos] : llvm::enumerate(innerDimPos))
-      vectorSizes[pos] *= innerTiles[i];
+      writeVectorSizes[pos] *= innerTiles[i];
 
     useInBoundsInsteadOfMasking = true;
   }
@@ -1948,17 +1986,20 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   //   After applying outer_dims_perm: [8, 16]
   //   After appending the rest of the sourceShape: [8, 16, 32, 16]
 
-  SmallVector<int64_t> readVectorSizes(vectorSizes.begin(), vectorSizes.end());
-
-  for (auto [index, size] : enumerate(innerTiles)) {
-    readVectorSizes[innerDimPos[index]] =
-        llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
-  }
-  if (!outerDimsPerm.empty()) {
-    applyPermutationToVector(readVectorSizes, outerDimsPerm);
+  if (readVectorSizes.empty()) {
+    // Compute read-vector-sizes based on the write-vector-sizes and inner tile
+    // sizes. Note, this will only work when all sizes are static.
+    readVectorSizes = writeVectorSizes;
+    for (auto [index, size] : enumerate(innerTiles)) {
+      readVectorSizes[innerDimPos[index]] =
+          llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
+    }
+    if (!outerDimsPerm.empty()) {
+      applyPermutationToVector(readVectorSizes, outerDimsPerm);
+    }
+    readVectorSizes.append(sourceShape.begin() + writeVectorSizes.size(),
+                           sourceShape.end());
   }
-  readVectorSizes.append(sourceShape.begin() + vectorSizes.size(),
-                         sourceShape.end());
 
   Location loc = unpackOp->getLoc();
 
@@ -1970,7 +2011,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   // to shape of source, then a mask is necessary.
   Value readResult = vector::createReadOrMaskedRead(
       rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue,
-      /*useInBoundsInsteadOfMasking=*/false);
+      /*useInBoundsInsteadOfMasking=*/false, readScalableVectorFlags);
 
   PackingMetadata packMetadata;
   SmallVector<int64_t> lastDimToInsertPosPerm =
@@ -2016,7 +2057,7 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
   assert(succeeded(status) && "failed to reify result shapes");
   auto maskedRead = vector::createReadOrMaskedRead(
       rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
-      /*useInBoundsInsteadOfMasking=*/false);
+      /*useInBoundsInsteadOfMasking=*/false, /*inputScalableVecSizes=*/{});
 
   // Create Xfer write Op
   Value dest = tensor::EmptyOp::create(rewriter, loc, reifiedReturnShapes[0],
@@ -2100,6 +2141,9 @@ static LogicalResult
 vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
                               ArrayRef<int64_t> inputVectorSizes) {
 
+  // FIXME!!!
+  return success();
+
   if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) {
         return !getConstantIntValue(res).has_value();
       })) {
@@ -2436,6 +2480,7 @@ vectorizePackOpPrecondition(linalg::PackOp packOp,
     LDBG() << "pad value is not constant: " << packOp;
     return failure();
   }
+
   ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
   bool satisfyEmptyCond = true;
   if (inputVectorSizes.empty()) {
@@ -2514,12 +2559,14 @@ vectorizeScalableVectorPrecondition(Operation *op,
   if (numOfScalableDims == 0)
     return success();
 
+  // TODO: Check the following!
   auto linalgOp = dyn_cast<LinalgOp>(op);
 
-  // Cond 1: There's been no need for scalable vectorisation of
-  // non-linalg Ops so far
-  if (!linalgOp)
-    return failure();
+  // Cond 1: Reject Ops that don't implement the LinalgOp interface, with the
+  // exception of UnpackOp for which there is a dedicated hook.
+  if (!linalgOp) {
+    return isa<linalg::UnPackOp>(op) ? success() : failure();
+  }
 
   // Cond 2: There's been no need for more than 2 scalable dims so far
   if (numOfScalableDims > 2)
@@ -2617,7 +2664,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
                  isa<linalg::MatmulTransposeAOp>(op) ||
                  isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
                  isa<linalg::MatvecOp>(op) || isa<linalg::Mmt4DOp>(op) ||
-                 hasReductionIterator(linalgOp));
+                 isa<linalg::UnPackOp>(op) || hasReductionIterator(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
@@ -2750,7 +2797,8 @@ FailureOr<VectorizationResult> mlir::linalg::vectorize(
           })
           .Case<linalg::UnPackOp>([&](auto unpackOp) {
             return vectorizeAsTensorUnpackOp(rewriter, unpackOp,
-                                             inputVectorSizes, results);
+                                             inputVectorSizes,
+                                             inputScalableVecDims, results);
           })
           .Case<tensor::InsertSliceOp>([&](auto sliceOp) {
             return vectorizeAsInsertSliceOp(rewriter, sliceOp, inputVectorSizes,
@@ -3142,7 +3190,8 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
       vecType.getRank(), arith::ConstantIndexOp::create(rewriter, loc, 0));
   Value read = mlir::vector::createReadOrMaskedRead(
       rewriter, loc, source, vecType.getShape(), padValue,
-      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty());
+      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty(),
+      /*inputScalableVecSizes=*/{});
 
   // Create write
   auto writeIndices =
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 10ed2bcfb35a3..34b1bdbd9e010 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -279,14 +279,16 @@ vector::createUnrollIterator(VectorType vType, int64_t targetRank) {
   // Attempt to unroll until targetRank or the first scalable dimension (which
   // cannot be unrolled).
   auto shapeToUnroll = vType.getShape().drop_back(targetRank);
-  auto scalableDimsToUnroll = vType.getScalableDims().drop_back(targetRank);
-  auto it = llvm::find(scalableDimsToUnroll, true);
-  auto firstScalableDim = it - scalableDimsToUnroll.begin();
+  auto inputScalableVecDimsToUnroll =
+      vType.getScalableDims().drop_back(targetRank);
+  auto it = llvm::find(inputScalableVecDimsToUnroll, true);
+  auto firstScalableDim = it - inputScalableVecDimsToUnroll.begin();
   if (firstScalableDim == 0)
     return {};
   // All scalable dimensions should be removed now.
-  scalableDimsToUnroll = scalableDimsToUnroll.slice(0, firstScalableDim);
-  assert(!llvm::is_contained(scalableDimsToUnroll, true) &&
+  inputScalableVecDimsToUnroll =
+      inputScalableVecDimsToUnroll.slice(0, firstScalableDim);
+  assert(!llvm::is_contained(inputScalableVecDimsToUnroll, true) &&
          "unexpected leading scalable dimension");
   // Create an unroll iterator for leading dimensions.
   shapeToUnroll = shapeToUnroll.slice(0, firstScalableDim);
@@ -319,15 +321,15 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                      ArrayRef<int64_t> inputVectorSizes,
                                      Value padValue,
                                      bool useInBoundsInsteadOfMasking,
-                                     ArrayRef<bool> scalableDims) {
+                                     ArrayRef<bool> inputScalableVecDims) {
   assert(!llvm::is_contained(inputVectorSizes, ShapedType::kDynamic) &&
          "invalid input vector sizes");
   auto sourceShapedType = cast<ShapedType>(source.getType());
   auto sourceShape = sourceShapedType.getShape();
   assert(sourceShape.size() == inputVectorSizes.size() &&
          "expected same ranks.");
-  auto vectorType =
-      VectorType::get(inputVectorSizes, padValue.getType(), scalableDims);
+  auto vectorType = VectorType::get(inputVectorSizes, padValue.getType(),
+                                    inputScalableVecDims);
   assert(padValue.getType() == sourceShapedType.getElementType() &&
          "expected same pad element type to match source element type");
   int64_t readRank = inputVectorSizes.size();
@@ -356,8 +358,8 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
           ? memref::getMixedSizes(builder, loc, source)
           : tensor::getMixedSizes(builder, loc, source);
 
-  auto maskType =
-      VectorType::get(inputVectorSizes, builder.getI1Type(), scalableDims);
+  auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type(),
+                                  inputScalableVecDims);
   Value mask =
       vector::CreateMaskOp::create(builder, loc, maskType, mixedSourceDims);
   return mlir::vector::maskOperation(builder, transferReadOp, mask)
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index d41d86117793b..ec227b46b409e 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -940,9 +940,9 @@ module attributes {transform.with_named_sequence} {
 ///----------------------------------------------------------------------------------------
 
 // CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack
-// CHECK-SAME:      %[[ARG_0:.*]]: tensor<?x?xf32>,
-// CHECK-SAME:      %[[ARG_1:.*]]: tensor<?x?x16x2xf32>
-func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
 // CHECK: %[[C0:.*]] = arith.constant 0
 // CHECK: %[[C01:.*]] = arith.constant 0
 // CHECK: %[[C02:.*]] = arith.constant 0
@@ -956,15 +956,93 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: t
 // CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
 // CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
 // CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]]
+// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[SRC]]
 // CHECK: return %[[write0]]
- %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+ %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
  return %ret : tensor<?x?xf32>
 }
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2, 4, 16] : !transform.any_op
+   transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0
+  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+  // CHECK: %[[C01:.*]] = arith.constant 0
+  // CHECK: %[[C02:.*]] = arith.constant 0
+  // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST14:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+  // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x[16]x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+   %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
+   transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x?x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0
+  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+  // CHECK: %[[C01:.*]] = arith.constant 0
+  // CHECK: %[[C02:.*]] = arith.constant 0
+  // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C1_2:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1_2]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[DIM_2:.*]] = tensor.dim %[[SRC]], %[[C2]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C2_1:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[DIM_2]], %[[C2_1]] : vector<2x1x[16]x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x?x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+
+  %vs = vector.vscale
+  %c16 = arith.constant 16 : index
+  %tile_size = arith.muli %vs, %c16 : index
+
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [%tile_size, 2] into %dest : tensor<?x?x?x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+   %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
    transform.yield
  }
 }
@@ -997,7 +1075,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16, 512, 128] : !transform.any_op
     transform.yield
   }
 }
@@ -1022,7 +1100,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
     transform.yield
   }
  }
@@ -1047,7 +1125,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
     transform.yield
   }
 }
@@ -1170,7 +1248,7 @@ module attributes {transform.with_named_sequence} {
 
 func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, [2]] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
 //  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32

>From 8509428b05830bf6ac9e40fa4053c6554014a3e6 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 24 Jul 2025 20:52:12 +0000
Subject: [PATCH 02/12] fixup! [mlir][linalg] Enable scalable vectorization of
 linalg.unpack (WIP)

Remove leftover code + comments
---
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 17e0a728e7d03..a8a184b972668 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1953,7 +1953,6 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   // 2. if outer_dims_perms is present: do that permutation on writeVectorSizes.
   // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
   //    innerTiles attribute value.
-  // SmallVector<int64_t> writeVectorSizes(inputVectorSizes);
   if (writeVectorSizes.empty()) {
     if (ShapedType::isDynamicShape(sourceShape))
       return failure();
@@ -2141,9 +2140,6 @@ static LogicalResult
 vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
                               ArrayRef<int64_t> inputVectorSizes) {
 
-  // FIXME!!!
-  return success();
-
   if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) {
         return !getConstantIntValue(res).has_value();
       })) {

>From f9766cf2118487d3f017e89ecb0b3d69eebf62af Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 25 Jul 2025 09:24:12 +0000
Subject: [PATCH 03/12] fixup! fixup! [mlir][linalg] Enable scalable
 vectorization of linalg.unpack (WIP)

Fix pre-condition calculation
---
 .../Linalg/Transforms/Vectorization.cpp       | 43 ++++++++++++++-----
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index a8a184b972668..412553c0a41e6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2135,24 +2135,45 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
   return success();
 }
 
-/// Need to check if the inner-tiles are static/constant.
+//// This hook considers two cases:
+///   (1) If the input-vector-sizes are empty, then the vector sizes will be
+///       infered. This is only possible when all shapes are static.
+///   (2) If the input-vector-sizes are non-empty (i.e. user provided), then
+///       carry out basic sanity-checking.
 static LogicalResult
 vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
                               ArrayRef<int64_t> inputVectorSizes) {
+  // If there are no input vector sizes and all shapes are static, there is
+  // nothing left to check.
+  if (inputVectorSizes.empty() && unpackOp.getDestType().hasStaticShape() &&
+      unpackOp.getSourceType().hasStaticShape())
+    return success();
 
-  if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) {
-        return !getConstantIntValue(res).has_value();
-      })) {
-    LDBG() << "Inner-tiles must be constant: " << unpackOp;
+  // The input vector sizes must be equal to:
+  //  * read-vector-rank + write-vector-rank
+  if (!inputVectorSizes.empty()) {
+    if (inputVectorSizes.size() !=
+        unpackOp.getDestRank() + unpackOp.getSourceRank()) {
+      LDBG("Incorrect number of input vector sizes");
+      return failure();
+    }
+  }
+
+  // Check the vector sizes for the write operation.
+  if (failed(vector::isValidMaskedInputVector(
+          unpackOp.getDestType().getShape(),
+          inputVectorSizes.take_back(unpackOp.getDestRank())))) {
+    LDBG("Incorrect number of input vector sizes");
     return failure();
   }
-  ArrayRef<int64_t> resultShape = unpackOp.getDestType().getShape();
-  bool satisfyEmptyCond = inputVectorSizes.empty() &&
-                          unpackOp.getDestType().hasStaticShape() &&
-                          unpackOp.getSourceType().hasStaticShape();
-  if (!satisfyEmptyCond &&
-      failed(vector::isValidMaskedInputVector(resultShape, inputVectorSizes)))
+
+  // Check the vector sizes for the read operation.
+  if (failed(vector::isValidMaskedInputVector(
+          unpackOp.getSourceType().getShape(),
+          inputVectorSizes.take_front(unpackOp.getSourceRank())))) {
+    LDBG("Incorrect number of input vector sizes");
     return failure();
+  }
 
   return success();
 }

>From fdcf92ac2589ba579ce496e1ad651ac5d0413e8b Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 25 Jul 2025 10:20:12 +0000
Subject: [PATCH 04/12] fixup! fixup! [mlir][linalg] Enable scalable
 vectorization of linalg.unpack (WIP)

Improve documentation + fix test after rebasing on top of
* https://github.com/llvm/llvm-project/pull/150602
---
 .../Linalg/Transforms/Vectorization.cpp       | 79 +++++++++----------
 .../Linalg/vectorization/linalg-ops.mlir      | 41 ++++------
 2 files changed, 52 insertions(+), 68 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 412553c0a41e6..5d9284f1e1d1a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1900,6 +1900,13 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
                           ArrayRef<bool> inputScalableVecDims,
                           SmallVectorImpl<Value> &newResults) {
+  if (!inputVectorSizes.empty()) {
+    assert(inputVectorSizes.size() ==
+               unpackOp.getDestRank() + unpackOp.getSourceRank() &&
+           "Invalid number of input vector sizes!");
+    assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
+           "Incompatible number of vector sizes and vector scalable flags!");
+  }
 
   // TODO: Introduce a parent class that will handle the insertion point update.
   OpBuilder::InsertionGuard g(rewriter);
@@ -1915,44 +1922,41 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   auto destSize = unpackOp.getDestRank();
 
-  if (!inputVectorSizes.empty()) {
-    assert(inputVectorSizes.size() == destSize + sourceShape.size() &&
-           "Incorrect number of input vector sizes");
-  }
-
-  SmallVector<bool> readScalableVectorFlags;
-  SmallVector<bool> writeScalableVectorFlags;
+  // 1. Obtain vector sizes for the read and write operation.s
   SmallVector<int64_t> readVectorSizes;
   SmallVector<int64_t> writeVectorSizes;
+  SmallVector<bool> readScalableVectorFlags;
+  SmallVector<bool> writeScalableVectorFlags;
 
-  // Split input-vector-sizes into vector sizes for the read and write
-  // operations.
+  // CASE 1: Vector sizes are user-specified.
+  // 1.0 This is the trivial case, simply split the input vector sizes.
   if (!inputVectorSizes.empty()) {
     readVectorSizes.append(inputVectorSizes.begin(),
                            inputVectorSizes.begin() + sourceShape.size());
     writeVectorSizes.append(inputVectorSizes.begin() + sourceShape.size(),
                             inputVectorSizes.end());
-  }
-  if (!inputScalableVecDims.empty()) {
     readScalableVectorFlags.append(inputScalableVecDims.begin(),
                                    inputScalableVecDims.begin() +
                                        sourceShape.size());
     writeScalableVectorFlags.append(inputScalableVecDims.begin() +
                                         sourceShape.size(),
                                     inputScalableVecDims.end());
-  } else {
-    readScalableVectorFlags = SmallVector<bool>(sourceShape.size(), false);
-    writeScalableVectorFlags = SmallVector<bool>(destSize, false);
   }
 
-  // writeVectorSizes is the shape of the vector that will be used to do final
-  // write on the destination tensor. It is set like this: Let's say the
-  // source tensor is rank 'M' and the dest tensor rank 'N', where N <= M.
-  // Thus:
-  // 1. writeVectorSizes = sourceShape.take_front(N)
-  // 2. if outer_dims_perms is present: do that permutation on writeVectorSizes.
-  // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
-  //    innerTiles attribute value.
+  // CASE 2: Vector sizes have to be inferred.
+  //
+  // 1.1 Infer vector sizes for the write operation.
+  //
+  // Let:
+  //    * rank(source tensor) = 'M'
+  //    * rank(dest tensor) = 'N',
+  // and N <= M. The steps are:
+  //  1. writeVectorSizes = sourceShape.take_front(N)
+  //  2. Multiply all the locations in writeVectorSize pointed by inner_dims_pos
+  //     by the corresponding values from the `inner_tiles` attribute value.
+  //  3. If outer_dims_perms is present, permutate writeVectorSizes accordingly.
+  //
+  // Note, this will only work when all sizes are static!
   if (writeVectorSizes.empty()) {
     if (ShapedType::isDynamicShape(sourceShape))
       return failure();
@@ -1966,28 +1970,17 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
     useInBoundsInsteadOfMasking = true;
   }
 
-  // readVectorSizes is the size of tensor used to read and apply mask. It is
-  // set like this: Let's say the vectorSize (VS) array is size 'N' and
-  // the sourceShape(SS) is 'M' where M >= N and InnerTileSizes (IT) of
-  // size M-N
-  // Thus:
-  // - initially: readVectorSizes = vectorInputSizes
-  // - Divide all the readMaskShape locations pointed by innerDimPos
-  //   by the innerTileSize attribute value.
-  // - if outer_dims_perms is present: do that permutation on readVectorSizes.
-  // - Append the remaining shape from SS
-  // E.g. let's say let's say unpackTensorType.getShape() = <8x8x32x16>
-  // inner Dim Pos = [0, 1] and Inner Tiles = [32, 16], vector_sizes are [512,
-  // 128] and outer_dims_perm is [1, 0] then read shape is:
-  //   ReadVectorSizes(initial): [512, 128]
-  //   Final Value(after innerDim Adjustment): [512/32, 128/16]
-  //                                           = [16, 8]
-  //   After applying outer_dims_perm: [8, 16]
-  //   After appending the rest of the sourceShape: [8, 16, 32, 16]
-
+  // 1.2 Infer vector sizes for the read operation.
+  //
+  // The steps are:
+  //  1. readVectorSizes = vectorInputSizes
+  //  2. Take readVectorSizes from 1. and divide all locations pointed by
+  //     the inner_dims_pos attribyte by the `inner_tiles` attribute value.
+  //  3. If outer_dims_perms is present, permutate readVectorSizes accordingly.
+  //  4. Append the remaining sizes from the source tensor.
+  //
+  // Note, this will only work when all sizes are static!
   if (readVectorSizes.empty()) {
-    // Compute read-vector-sizes based on the write-vector-sizes and inner tile
-    // sizes. Note, this will only work when all sizes are static.
     readVectorSizes = writeVectorSizes;
     for (auto [index, size] : enumerate(innerTiles)) {
       readVectorSizes[innerDimPos[index]] =
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index ec227b46b409e..fcb8b02d3faa3 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -943,23 +943,22 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0
-// CHECK: %[[C01:.*]] = arith.constant 0
-// CHECK: %[[C02:.*]] = arith.constant 0
-// CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG_1]], %[[C02]] : tensor<?x?x16x2xf32>
-// CHECK: %[[C1:.*]] = arith.constant 1
-// CHECK: %[[DIM6:.*]] = tensor.dim %[[ARG_1]], %[[C1]] : tensor<?x?x16x2xf32>
-// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
-// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
-// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
-// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
-// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
-// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
-// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[SRC]]
-// CHECK: return %[[write0]]
- %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
- return %ret : tensor<?x?xf32>
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
+  // CHECK: %[[DIM_0:.*]] = tensor.dim %[[SRC]], %[[C0_1]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+  // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM_0]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x16xf32> to vector<4x16xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
 }
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -975,10 +974,6 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-  // CHECK: %[[C0:.*]] = arith.constant 0
-  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
-  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
   // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
   // CHECK: %[[C01:.*]] = arith.constant 0
   // CHECK: %[[C02:.*]] = arith.constant 0
@@ -1011,10 +1006,6 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x?x2xf32>
 func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> {
-  // CHECK: %[[C0:.*]] = arith.constant 0
-  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
-  // CHECK: %[[C1:.*]] = arith.constant 1 : index
-  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
   // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
   // CHECK: %[[C01:.*]] = arith.constant 0
   // CHECK: %[[C02:.*]] = arith.constant 0

>From a8452e9c7b97abd8072a0752ec03325a5d2255e0 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 25 Jul 2025 10:49:19 +0000
Subject: [PATCH 05/12] fixup! fixup! fixup! [mlir][linalg] Enable scalable
 vectorization of linalg.unpack (WIP)

Remove unintended test change
---
 mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index fcb8b02d3faa3..9c9ddb54d1d5f 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -1239,7 +1239,7 @@ module attributes {transform.with_named_sequence} {
 
 func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, [2]] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
 //  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32

>From 771a1f165d7e996b26b7eb327222fe706123e728 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 25 Jul 2025 10:56:00 +0000
Subject: [PATCH 06/12] fixup! fixup! fixup! fixup! [mlir][linalg] Enable
 scalable vectorization of linalg.unpack (WIP)

Remove TODO
---
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 5d9284f1e1d1a..97ef344162d41 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2569,7 +2569,6 @@ vectorizeScalableVectorPrecondition(Operation *op,
   if (numOfScalableDims == 0)
     return success();
 
-  // TODO: Check the following!
   auto linalgOp = dyn_cast<LinalgOp>(op);
 
   // Cond 1: Reject Ops that don't implement the LinalgOp interface, with the

>From 8a06fd57be094f4bbe50df07afbe29e87cd45e1e Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 25 Jul 2025 10:57:56 +0000
Subject: [PATCH 07/12] fixup! fixup! fixup! fixup! fixup! [mlir][linalg]
 Enable scalable vectorization of linalg.unpack (WIP)

Fix comment
---
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 97ef344162d41..b4da8040f761d 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1973,7 +1973,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   // 1.2 Infer vector sizes for the read operation.
   //
   // The steps are:
-  //  1. readVectorSizes = vectorInputSizes
+  //  1. readVectorSizes = writeVectorSizes
   //  2. Take readVectorSizes from 1. and divide all locations pointed by
   //     the inner_dims_pos attribyte by the `inner_tiles` attribute value.
   //  3. If outer_dims_perms is present, permutate readVectorSizes accordingly.

>From d2f14b9b61ef3f1ffa2f618e068a9999b550d1e1 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Mon, 28 Jul 2025 09:22:47 +0000
Subject: [PATCH 08/12] Simplify code as per comments from HanHan

---
 .../Linalg/Transforms/Vectorization.cpp       | 146 +++++++-----------
 mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp |   3 +-
 2 files changed, 60 insertions(+), 89 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index b4da8040f761d..607a3df3722c3 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1879,22 +1879,35 @@ static VectorType getCollapsedVecType(VectorType type,
   return VectorType::get(newShape, type.getElementType(), newScalableFlags);
 }
 
-/// Vectorize a `linalg::UnPackOp` to these 4 Ops:
-///   Vector::TransferReadOp - Reads a vector from the source tensor
-///   vector::TransposeOp - Transpose the Source tensor
-///   ShapeCastOp - Reshape the data based on the target.
-///   vector::TransferWriteOp. - Write the result vector back to the destination
-///   tensor.
-///   If the vector sizes are not provided:
-/// Vectorize `linalg.unpack %src into %dest` as:
-///   // Reads a vector from the source tensor
-///   %read = vector.transfer_read  %src
-///   // Transpose %read as specified in `outer_dims_perm` attribute
-///   %tr = vector.transpose %read
-///   // Reshape the data based on the target
-///   %sc = vector.shape_cast %tr
-///   // Write the result vector to the destination tensor.
-///   vector.transfer_write %sc into %dest
+/// Vectorize `linalg.unpack` into:
+///   * xfer_read -> vector.transpose -> vector.shape_cast -> xfer_write
+///
+/// The input-vector-sizes specify both the read and the write vector
+/// sizes and are passed as one array covering both operations, i.e.:
+///
+///  input-vector-sizes = [1, 1, 8, [8],  8, [8]]
+///                        \         /    \    /
+///                        read-sizes   write-sizes
+///
+/// (for brefity, in the diagram,
+///    * input-vector-sizes = `inputVectorSizes` + `inputScalableDims`
+/// )
+///
+/// If the vector sizes are not provided:
+///  * the vector sizes are determined by the operands,
+///  * the inBounds attribute is used instead of masking.
+///
+/// EXAMPLE (no vector sizes):
+/// ```
+///   %unpack = linalg.unpack  %src
+///    inner_dims_pos = [0, 1]
+///    inner_tiles = [8, 8]
+///    into %dest : tensor<1x1x8x8xf32> -> tensor<8x8xf32>
+/// ```
+/// is vectorized as:
+/// ```
+///   vector.transfer_write %sc into %dest : vector<8x8xf32>, tensor<8x8xf32>
+/// ```
 static LogicalResult
 vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
@@ -1914,22 +1927,19 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   RankedTensorType unpackTensorType = unpackOp.getSourceType();
 
-  ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos();
-  ArrayRef<int64_t> innerTiles = unpackOp.getStaticInnerTiles();
   ArrayRef<int64_t> sourceShape = unpackTensorType.getShape();
+  ArrayRef<int64_t> destShape = unpackOp.getDestType().getShape();
   bool useInBoundsInsteadOfMasking = false;
-  ArrayRef<int64_t> outerDimsPerm = unpackOp.getOuterDimsPerm();
 
-  auto destSize = unpackOp.getDestRank();
+  Location loc = unpackOp->getLoc();
 
-  // 1. Obtain vector sizes for the read and write operation.s
+  // 1. Obtain vector sizes for the read and write operations.
   SmallVector<int64_t> readVectorSizes;
   SmallVector<int64_t> writeVectorSizes;
   SmallVector<bool> readScalableVectorFlags;
   SmallVector<bool> writeScalableVectorFlags;
 
-  // CASE 1: Vector sizes are user-specified.
-  // 1.0 This is the trivial case, simply split the input vector sizes.
+  // CASE 1.1: Vector sizes are user-specified.
   if (!inputVectorSizes.empty()) {
     readVectorSizes.append(inputVectorSizes.begin(),
                            inputVectorSizes.begin() + sourceShape.size());
@@ -1943,76 +1953,33 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                                     inputScalableVecDims.end());
   }
 
-  // CASE 2: Vector sizes have to be inferred.
-  //
-  // 1.1 Infer vector sizes for the write operation.
-  //
-  // Let:
-  //    * rank(source tensor) = 'M'
-  //    * rank(dest tensor) = 'N',
-  // and N <= M. The steps are:
-  //  1. writeVectorSizes = sourceShape.take_front(N)
-  //  2. Multiply all the locations in writeVectorSize pointed by inner_dims_pos
-  //     by the corresponding values from the `inner_tiles` attribute value.
-  //  3. If outer_dims_perms is present, permutate writeVectorSizes accordingly.
-  //
-  // Note, this will only work when all sizes are static!
+  // CASE 1. 2: Vector sizes have to be inferred.
   if (writeVectorSizes.empty()) {
-    if (ShapedType::isDynamicShape(sourceShape))
+    if (ShapedType::isDynamicShape(destShape) ||
+        ShapedType::isDynamicShape(sourceShape))
       return failure();
 
-    llvm::append_range(writeVectorSizes, sourceShape.take_front(destSize));
-    if (!outerDimsPerm.empty())
-      applyPermutationToVector(writeVectorSizes, outerDimsPerm);
-    for (auto [i, pos] : llvm::enumerate(innerDimPos))
-      writeVectorSizes[pos] *= innerTiles[i];
-
+    readVectorSizes.assign(sourceShape.begin(), sourceShape.end());
+    writeVectorSizes.assign(destShape.begin(), destShape.end());
     useInBoundsInsteadOfMasking = true;
   }
 
-  // 1.2 Infer vector sizes for the read operation.
-  //
-  // The steps are:
-  //  1. readVectorSizes = writeVectorSizes
-  //  2. Take readVectorSizes from 1. and divide all locations pointed by
-  //     the inner_dims_pos attribyte by the `inner_tiles` attribute value.
-  //  3. If outer_dims_perms is present, permutate readVectorSizes accordingly.
-  //  4. Append the remaining sizes from the source tensor.
-  //
-  // Note, this will only work when all sizes are static!
-  if (readVectorSizes.empty()) {
-    readVectorSizes = writeVectorSizes;
-    for (auto [index, size] : enumerate(innerTiles)) {
-      readVectorSizes[innerDimPos[index]] =
-          llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
-    }
-    if (!outerDimsPerm.empty()) {
-      applyPermutationToVector(readVectorSizes, outerDimsPerm);
-    }
-    readVectorSizes.append(sourceShape.begin() + writeVectorSizes.size(),
-                           sourceShape.end());
-  }
-
-  Location loc = unpackOp->getLoc();
-
+  // 2. Generate the read operation.
   auto padValue = arith::ConstantOp::create(
       rewriter, loc,
       rewriter.getZeroAttr(unpackOp.getSourceType().getElementType()));
-
-  // Read result, mask if necessary. If transferReadOp shape is not equal
-  // to shape of source, then a mask is necessary.
   Value readResult = vector::createReadOrMaskedRead(
       rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue,
       /*useInBoundsInsteadOfMasking=*/false, readScalableVectorFlags);
 
+  // 3. Generate the transpose operation.
   PackingMetadata packMetadata;
   SmallVector<int64_t> lastDimToInsertPosPerm =
       getUnPackInverseSrcPerm(unpackOp, packMetadata);
-  // Transpose the appropriate rows to match output.
   vector::TransposeOp transposeOp = vector::TransposeOp::create(
       rewriter, loc, readResult, lastDimToInsertPosPerm);
 
-  // Collapse the vector to the size required by result.
+  // 3. Generate the shape_cast operation.
   VectorType collapsedVecType = getCollapsedVecType(
       transposeOp.getType(),
       getSymbolLessAffineMaps(convertReassociationIndicesToExprs(
@@ -2020,6 +1987,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   vector::ShapeCastOp shapeCastOp = vector::ShapeCastOp::create(
       rewriter, loc, collapsedVecType, transposeOp->getResult(0));
 
+  // 4. Generate the write operation.
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, shapeCastOp.getResult(), unpackOp.getDest(),
       /*writeIndices=*/{}, useInBoundsInsteadOfMasking);
@@ -2147,24 +2115,24 @@ vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
   if (!inputVectorSizes.empty()) {
     if (inputVectorSizes.size() !=
         unpackOp.getDestRank() + unpackOp.getSourceRank()) {
-      LDBG("Incorrect number of input vector sizes");
+      LDBG() << "Incorrect number of input vector sizes";
       return failure();
     }
   }
 
-  // Check the vector sizes for the write operation.
+  // Check the vector sizes for the read operation.
   if (failed(vector::isValidMaskedInputVector(
-          unpackOp.getDestType().getShape(),
-          inputVectorSizes.take_back(unpackOp.getDestRank())))) {
-    LDBG("Incorrect number of input vector sizes");
+          unpackOp.getSourceType().getShape(),
+          inputVectorSizes.take_front(unpackOp.getSourceRank())))) {
+    LDBG() << "Invalid vector sizes for the read operation";
     return failure();
   }
 
-  // Check the vector sizes for the read operation.
+  // Check the vector sizes for the write operation.
   if (failed(vector::isValidMaskedInputVector(
-          unpackOp.getSourceType().getShape(),
-          inputVectorSizes.take_front(unpackOp.getSourceRank())))) {
-    LDBG("Incorrect number of input vector sizes");
+          unpackOp.getDestType().getShape(),
+          inputVectorSizes.take_back(unpackOp.getDestRank())))) {
+    LDBG() << "Invalid vector sizes for the write operation";
     return failure();
   }
 
@@ -2554,8 +2522,12 @@ vectorizePadOpPrecondition(tensor::PadOp padOp,
   return success();
 }
 
-/// Preconditions for scalable vectors. This is quite restrictive - it models
-/// the fact that in practice we would only make selected dimensions scalable.
+/// Preconditions for scalable vectors.
+///
+/// For Ops implementing the LinalgOp interface, this is quite restrictive - it
+/// models the fact that in practice we would only make selected dimensions
+/// scalable. For other Ops (e.g. `linalg.unpack`), this will succed
+/// unconditionally - we are yet to identify meaningful conditions.
 static LogicalResult
 vectorizeScalableVectorPrecondition(Operation *op,
                                     ArrayRef<int64_t> inputVectorSizes,
@@ -2574,7 +2546,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
   // Cond 1: Reject Ops that don't implement the LinalgOp interface, with the
   // exception of UnpackOp for which there is a dedicated hook.
   if (!linalgOp) {
-    return isa<linalg::UnPackOp>(op) ? success() : failure();
+    return success(isa<linalg::UnPackOp>(op));
   }
 
   // Cond 2: There's been no need for more than 2 scalable dims so far
@@ -2673,7 +2645,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
                  isa<linalg::MatmulTransposeAOp>(op) ||
                  isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
                  isa<linalg::MatvecOp>(op) || isa<linalg::Mmt4DOp>(op) ||
-                 isa<linalg::UnPackOp>(op) || hasReductionIterator(linalgOp));
+                 hasReductionIterator(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 34b1bdbd9e010..6e2fa35e1279a 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -387,8 +387,7 @@ vector::isValidMaskedInputVector(ArrayRef<int64_t> shape,
                              staticSize <= inputSize;
                     })) {
     LDBG() << "Input vector sizes must be greater than or equal to iteration "
-              "space "
-              "static sizes";
+              "space static sizes";
     return failure();
   }
   return success();

>From acfe43252e32a1f13dfb3c6a965024d500dccf93 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 30 Jul 2025 13:26:26 +0000
Subject: [PATCH 09/12] Address the remaining comments from HanHan

---
 .../Linalg/Transforms/Vectorization.cpp       | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 607a3df3722c3..79c37ca1088f1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1939,22 +1939,21 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   SmallVector<bool> readScalableVectorFlags;
   SmallVector<bool> writeScalableVectorFlags;
 
-  // CASE 1.1: Vector sizes are user-specified.
   if (!inputVectorSizes.empty()) {
-    readVectorSizes.append(inputVectorSizes.begin(),
+    // CASE 1.1: Vector sizes are user-specified.
+    readVectorSizes.assign(inputVectorSizes.begin(),
                            inputVectorSizes.begin() + sourceShape.size());
-    writeVectorSizes.append(inputVectorSizes.begin() + sourceShape.size(),
+    writeVectorSizes.assign(inputVectorSizes.begin() + sourceShape.size(),
                             inputVectorSizes.end());
-    readScalableVectorFlags.append(inputScalableVecDims.begin(),
+    readScalableVectorFlags.assign(inputScalableVecDims.begin(),
                                    inputScalableVecDims.begin() +
                                        sourceShape.size());
-    writeScalableVectorFlags.append(inputScalableVecDims.begin() +
+    writeScalableVectorFlags.assign(inputScalableVecDims.begin() +
                                         sourceShape.size(),
                                     inputScalableVecDims.end());
-  }
-
-  // CASE 1. 2: Vector sizes have to be inferred.
-  if (writeVectorSizes.empty()) {
+  } else {
+    // CASE 1.2: Vector sizes are inferred from the static input tensor
+    // shapes.
     if (ShapedType::isDynamicShape(destShape) ||
         ShapedType::isDynamicShape(sourceShape))
       return failure();
@@ -2112,12 +2111,11 @@ vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
 
   // The input vector sizes must be equal to:
   //  * read-vector-rank + write-vector-rank
-  if (!inputVectorSizes.empty()) {
-    if (inputVectorSizes.size() !=
-        unpackOp.getDestRank() + unpackOp.getSourceRank()) {
-      LDBG() << "Incorrect number of input vector sizes";
-      return failure();
-    }
+  if (!inputVectorSizes.empty() &&
+      (inputVectorSizes.size() !=
+       unpackOp.getDestRank() + unpackOp.getSourceRank())) {
+    LDBG() << "Incorrect number of input vector sizes";
+    return failure();
   }
 
   // Check the vector sizes for the read operation.

>From 65488765fa9b71606c41515fce1b6101aab270cf Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 31 Jul 2025 12:13:01 +0000
Subject: [PATCH 10/12] Simplify to only require read-vector-sizes.

---
 .../Linalg/Transforms/Vectorization.cpp       | 44 ++++---------------
 .../Linalg/vectorization/linalg-ops.mlir      | 12 ++---
 2 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 79c37ca1088f1..e0531bd3c3f37 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1879,19 +1879,12 @@ static VectorType getCollapsedVecType(VectorType type,
   return VectorType::get(newShape, type.getElementType(), newScalableFlags);
 }
 
-/// Vectorize `linalg.unpack` into:
+/// Vectorize `linalg.unpack` as:
 ///   * xfer_read -> vector.transpose -> vector.shape_cast -> xfer_write
 ///
-/// The input-vector-sizes specify both the read and the write vector
-/// sizes and are passed as one array covering both operations, i.e.:
-///
-///  input-vector-sizes = [1, 1, 8, [8],  8, [8]]
-///                        \         /    \    /
-///                        read-sizes   write-sizes
-///
-/// (for brefity, in the diagram,
-///    * input-vector-sizes = `inputVectorSizes` + `inputScalableDims`
-/// )
+/// The input-vector-sizes specify the read vector sizes (i.e. the vector sizes
+/// for the xfer_read operation). This is sufficient to infer the other vector
+/// sizes required here.
 ///
 /// If the vector sizes are not provided:
 ///  * the vector sizes are determined by the operands,
@@ -1914,8 +1907,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<bool> inputScalableVecDims,
                           SmallVectorImpl<Value> &newResults) {
   if (!inputVectorSizes.empty()) {
-    assert(inputVectorSizes.size() ==
-               unpackOp.getDestRank() + unpackOp.getSourceRank() &&
+    assert(inputVectorSizes.size() == unpackOp.getSourceRank() &&
            "Invalid number of input vector sizes!");
     assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
            "Incompatible number of vector sizes and vector scalable flags!");
@@ -1935,22 +1927,15 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   // 1. Obtain vector sizes for the read and write operations.
   SmallVector<int64_t> readVectorSizes;
-  SmallVector<int64_t> writeVectorSizes;
   SmallVector<bool> readScalableVectorFlags;
-  SmallVector<bool> writeScalableVectorFlags;
 
   if (!inputVectorSizes.empty()) {
     // CASE 1.1: Vector sizes are user-specified.
     readVectorSizes.assign(inputVectorSizes.begin(),
                            inputVectorSizes.begin() + sourceShape.size());
-    writeVectorSizes.assign(inputVectorSizes.begin() + sourceShape.size(),
-                            inputVectorSizes.end());
     readScalableVectorFlags.assign(inputScalableVecDims.begin(),
                                    inputScalableVecDims.begin() +
                                        sourceShape.size());
-    writeScalableVectorFlags.assign(inputScalableVecDims.begin() +
-                                        sourceShape.size(),
-                                    inputScalableVecDims.end());
   } else {
     // CASE 1.2: Vector sizes are inferred from the static input tensor
     // shapes.
@@ -1959,7 +1944,6 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
       return failure();
 
     readVectorSizes.assign(sourceShape.begin(), sourceShape.end());
-    writeVectorSizes.assign(destShape.begin(), destShape.end());
     useInBoundsInsteadOfMasking = true;
   }
 
@@ -2109,31 +2093,21 @@ vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
       unpackOp.getSourceType().hasStaticShape())
     return success();
 
-  // The input vector sizes must be equal to:
-  //  * read-vector-rank + write-vector-rank
+  // The number of input vector sizes must be equal to:
+  //  * read-vector-rank
   if (!inputVectorSizes.empty() &&
-      (inputVectorSizes.size() !=
-       unpackOp.getDestRank() + unpackOp.getSourceRank())) {
+      (inputVectorSizes.size() != unpackOp.getSourceRank())) {
     LDBG() << "Incorrect number of input vector sizes";
     return failure();
   }
 
   // Check the vector sizes for the read operation.
   if (failed(vector::isValidMaskedInputVector(
-          unpackOp.getSourceType().getShape(),
-          inputVectorSizes.take_front(unpackOp.getSourceRank())))) {
+          unpackOp.getSourceType().getShape(), inputVectorSizes))) {
     LDBG() << "Invalid vector sizes for the read operation";
     return failure();
   }
 
-  // Check the vector sizes for the write operation.
-  if (failed(vector::isValidMaskedInputVector(
-          unpackOp.getDestType().getShape(),
-          inputVectorSizes.take_back(unpackOp.getDestRank())))) {
-    LDBG() << "Invalid vector sizes for the write operation";
-    return failure();
-  }
-
   return success();
 }
 
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 9c9ddb54d1d5f..095810fe0451e 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -963,7 +963,7 @@ func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: te
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2, 4, 16] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2] : !transform.any_op
    transform.yield
  }
 }
@@ -995,7 +995,7 @@ func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op
    transform.yield
  }
 }
@@ -1033,7 +1033,7 @@ func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest
 module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2] : !transform.any_op
    transform.yield
  }
 }
@@ -1066,7 +1066,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16, 512, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16] : !transform.any_op
     transform.yield
   }
 }
@@ -1091,7 +1091,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op
     transform.yield
   }
  }
@@ -1116,7 +1116,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
  module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
+   transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16] : !transform.any_op
     transform.yield
   }
 }

>From 656f7ef60040708f1528a7521ebade7156092a93 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 1 Aug 2025 12:35:07 +0000
Subject: [PATCH 11/12] Final tweaks

---
 .../Linalg/Transforms/Vectorization.cpp       | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e0531bd3c3f37..a2044816cd349 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1887,7 +1887,7 @@ static VectorType getCollapsedVecType(VectorType type,
 /// sizes required here.
 ///
 /// If the vector sizes are not provided:
-///  * the vector sizes are determined by the operands,
+///  * the vector sizes are determined from the input tensor static shape.
 ///  * the inBounds attribute is used instead of masking.
 ///
 /// EXAMPLE (no vector sizes):
@@ -1899,7 +1899,14 @@ static VectorType getCollapsedVecType(VectorType type,
 /// ```
 /// is vectorized as:
 /// ```
-///   vector.transfer_write %sc into %dest : vector<8x8xf32>, tensor<8x8xf32>
+///   %read = vector.transfer_read %src
+///     : tensor<1x1x8x8xf32>, vector<1x1x8x8xf32>
+///   %tr = vector.transpose %read, [0, 2, 1, 3]
+///     : vector<1x1x8x8xf32> to vector<1x8x1x8xf32>
+///   %sc = vector.shape_cast %tr
+///     : vector<1x8x1x8xf32> to vector<8x8xf32>
+///   %vector = vector.transfer_write %sc into %dest
+///     : vector<8x8xf32>, tensor<8x8xf32>
 /// ```
 static LogicalResult
 vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
@@ -1920,49 +1927,39 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   RankedTensorType unpackTensorType = unpackOp.getSourceType();
 
   ArrayRef<int64_t> sourceShape = unpackTensorType.getShape();
-  ArrayRef<int64_t> destShape = unpackOp.getDestType().getShape();
   bool useInBoundsInsteadOfMasking = false;
 
   Location loc = unpackOp->getLoc();
 
-  // 1. Obtain vector sizes for the read and write operations.
-  SmallVector<int64_t> readVectorSizes;
-  SmallVector<bool> readScalableVectorFlags;
+  // Obtain vector sizes for the read operation.
+  SmallVector<int64_t> readVectorSizes(inputVectorSizes);
+  SmallVector<bool> readScalableVectorFlags(inputScalableVecDims);
 
-  if (!inputVectorSizes.empty()) {
-    // CASE 1.1: Vector sizes are user-specified.
-    readVectorSizes.assign(inputVectorSizes.begin(),
-                           inputVectorSizes.begin() + sourceShape.size());
-    readScalableVectorFlags.assign(inputScalableVecDims.begin(),
-                                   inputScalableVecDims.begin() +
-                                       sourceShape.size());
-  } else {
-    // CASE 1.2: Vector sizes are inferred from the static input tensor
-    // shapes.
-    if (ShapedType::isDynamicShape(destShape) ||
-        ShapedType::isDynamicShape(sourceShape))
+  // In the absence of input-vector-sizes, use the _static_ input tensor shape.
+  if (inputVectorSizes.empty()) {
+    if (ShapedType::isDynamicShape(sourceShape))
       return failure();
 
     readVectorSizes.assign(sourceShape.begin(), sourceShape.end());
     useInBoundsInsteadOfMasking = true;
   }
 
-  // 2. Generate the read operation.
+  // -- Generate the read operation --
   auto padValue = arith::ConstantOp::create(
       rewriter, loc,
       rewriter.getZeroAttr(unpackOp.getSourceType().getElementType()));
   Value readResult = vector::createReadOrMaskedRead(
       rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue,
-      /*useInBoundsInsteadOfMasking=*/false, readScalableVectorFlags);
+      useInBoundsInsteadOfMasking, readScalableVectorFlags);
 
-  // 3. Generate the transpose operation.
+  // -- Generate the transpose operation --
   PackingMetadata packMetadata;
   SmallVector<int64_t> lastDimToInsertPosPerm =
       getUnPackInverseSrcPerm(unpackOp, packMetadata);
   vector::TransposeOp transposeOp = vector::TransposeOp::create(
       rewriter, loc, readResult, lastDimToInsertPosPerm);
 
-  // 3. Generate the shape_cast operation.
+  // -- Generate the shape_cast operation --
   VectorType collapsedVecType = getCollapsedVecType(
       transposeOp.getType(),
       getSymbolLessAffineMaps(convertReassociationIndicesToExprs(
@@ -1970,10 +1967,11 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   vector::ShapeCastOp shapeCastOp = vector::ShapeCastOp::create(
       rewriter, loc, collapsedVecType, transposeOp->getResult(0));
 
-  // 4. Generate the write operation.
+  // -- Generate the write operation --
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, shapeCastOp.getResult(), unpackOp.getDest(),
       /*writeIndices=*/{}, useInBoundsInsteadOfMasking);
+
   newResults.push_back(write->getResult(0));
   return success();
 }

>From 5b8fc0ceb4ccb55d09ad992ad717639c8d0d87b1 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 1 Aug 2025 13:48:59 +0000
Subject: [PATCH 12/12] Fix typo

---
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index a2044816cd349..cf65e673a5c44 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2496,7 +2496,7 @@ vectorizePadOpPrecondition(tensor::PadOp padOp,
 ///
 /// For Ops implementing the LinalgOp interface, this is quite restrictive - it
 /// models the fact that in practice we would only make selected dimensions
-/// scalable. For other Ops (e.g. `linalg.unpack`), this will succed
+/// scalable. For other Ops (e.g. `linalg.unpack`), this will succeed
 /// unconditionally - we are yet to identify meaningful conditions.
 static LogicalResult
 vectorizeScalableVectorPrecondition(Operation *op,