[Mlir-commits] [mlir] [mlir][linalg] Add an e2e test for linalg.matmul to ArmSME (PR #72144)

Cullen Rhodes llvmlistbot at llvm.org
Mon Nov 13 10:02:29 PST 2023


https://github.com/c-rhodes created https://github.com/llvm/llvm-project/pull/72144

This patch adds an integration test lowering a linalg.matmul to SME via
vector.outerproduct.

It's similar to the linalg.matmul_transpose_a e2e test added recently, but it
additionally requires the following patterns:

  * vector.rank_reducing_subview_patterns
  * vector.lower_transpose

to lower the following sequence (taken from the inner loop):

  %subview = memref.subview %arg0[%arg3, %arg5] [%2, 1] [1, 1] :
    memref<?x?xf32, strided<[?, ?], offset: ?>> to memref<?x1xf32, strided<[?, ?], offset: ?>>
  %mask = vector.create_mask %2, %c1 : vector<[4]x1xi1>
  %0 = vector.transfer_read %subview[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
    memref<?x1xf32, strided<[?, ?], offset: ?>>, vector<[4]x1xf32>
  %1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
  %2 = vector.extract %1[0] : vector<[4]xf32> from vector<1x[4]xf32>

such that the unit dim is dropped and the vector.transfer_read can be
lowered via the generic path; rank-2 vectors with a leading scalable dim
can't be type converted to an array.
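
For reference, after these patterns (plus canonicalization) the masked read
above ends up roughly as follows. This is a sketch based on the tests in the
patches below rather than the exact IR, and the value names are illustrative:

  %subview_1d = memref.subview %subview[0, 0] [%2, 1] [1, 1] :
    memref<?x1xf32, strided<[?, ?], offset: ?>> to memref<?xf32, strided<[?], offset: ?>>
  %mask_1d = vector.create_mask %2 : vector<[4]xi1>
  %3 = vector.transfer_read %subview_1d[%c0], %pad, %mask_1d {in_bounds = [true]} :
    memref<?xf32, strided<[?], offset: ?>>, vector<[4]xf32>

The vector.transpose and vector.extract then fold away once the transpose
with a unit dim has been rewritten to a vector.shape_cast.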

>From e0b2fd9c1a896d988626b4a2f57abae04be48fc6 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Thu, 9 Nov 2023 19:44:34 +0000
Subject: [PATCH 1/3] [mlir][vector] Extend vector.transfer_read drop unit dim
 pattern

This patch extends the vector.transfer_read drop unit dim pattern to
support scalable vectors with (non-scalable) unit dims, and dynamic
memrefs. The xfer op can also have a mask of type 'vector.create_mask',
which is rewritten.
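
For example (paraphrased from the masked_transfer_read_dynamic_rank_reducing
test added below), a masked read such as:

  %mask = vector.create_mask %mask_dim0, %c1 : vector<[16]x1xi1>
  %v = vector.transfer_read %arg[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
    memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x1xi8>

is rewritten to a 1-D mask and a read from a rank-reduced subview (followed
by a vector.shape_cast back to the original vector type); the result memref
layout shown here is inferred, the test only checks it with a wildcard:

  %mask = vector.create_mask %mask_dim0 : vector<[16]xi1>
  %dim0 = memref.dim %arg, %c0 : memref<?x1xi8, strided<[?, ?], offset: ?>>
  %subview = memref.subview %arg[0, 0] [%dim0, 1] [1, 1] :
    memref<?x1xi8, strided<[?, ?], offset: ?>> to memref<?xi8, strided<[?], offset: ?>>
  %v = vector.transfer_read %subview[%c0], %pad, %mask {in_bounds = [true]} :
    memref<?xi8, strided<[?], offset: ?>>, vector<[16]xi8>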
---
 .../Transforms/VectorTransferOpTransforms.cpp | 98 +++++++++++++------
 ...ctor-transfer-drop-unit-dims-patterns.mlir | 86 ++++++++++++++++
 2 files changed, 155 insertions(+), 29 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
index a5f1b28152b9bde..95445f2081ec89c 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
@@ -260,14 +260,22 @@ void TransferOptimization::storeToLoadForwarding(vector::TransferReadOp read) {
   opToErase.push_back(read.getOperation());
 }
 
+/// Returns a copy of `shape` without unit dims.
+static SmallVector<int64_t> getReducedShape(ArrayRef<int64_t> shape) {
+  SmallVector<int64_t> reducedShape;
+  llvm::copy_if(shape, std::back_inserter(reducedShape),
+                [](int64_t dimSize) { return dimSize != 1; });
+  return reducedShape;
+}
+
 /// Drops unit dimensions from the input MemRefType.
-static MemRefType dropUnitDims(MemRefType inputType, ArrayRef<int64_t> offsets,
-                               ArrayRef<int64_t> sizes,
-                               ArrayRef<int64_t> strides) {
-  SmallVector<int64_t> targetShape = llvm::to_vector(
-      llvm::make_filter_range(sizes, [](int64_t sz) { return sz != 1; }));
+static MemRefType dropUnitDims(MemRefType inputType,
+                               ArrayRef<OpFoldResult> offsets,
+                               ArrayRef<OpFoldResult> sizes,
+                               ArrayRef<OpFoldResult> strides) {
   Type rankReducedType = memref::SubViewOp::inferRankReducedResultType(
-      targetShape, inputType, offsets, sizes, strides);
+      getReducedShape(inputType.getShape()), inputType, offsets, sizes,
+      strides);
   return canonicalizeStridedLayout(cast<MemRefType>(rankReducedType));
 }
 
@@ -277,17 +285,18 @@ static Value rankReducingSubviewDroppingUnitDims(PatternRewriter &rewriter,
                                                  mlir::Location loc,
                                                  Value input) {
   MemRefType inputType = cast<MemRefType>(input.getType());
-  assert(inputType.hasStaticShape());
-  SmallVector<int64_t> subViewOffsets(inputType.getRank(), 0);
-  SmallVector<int64_t> subViewStrides(inputType.getRank(), 1);
-  ArrayRef<int64_t> subViewSizes = inputType.getShape();
-  MemRefType resultType =
-      dropUnitDims(inputType, subViewOffsets, subViewSizes, subViewStrides);
+  SmallVector<OpFoldResult> offsets(inputType.getRank(),
+                                    rewriter.getIndexAttr(0));
+  SmallVector<OpFoldResult> sizes = memref::getMixedSizes(rewriter, loc, input);
+  SmallVector<OpFoldResult> strides(inputType.getRank(),
+                                    rewriter.getIndexAttr(1));
+  MemRefType resultType = dropUnitDims(inputType, offsets, sizes, strides);
+
   if (canonicalizeStridedLayout(resultType) ==
       canonicalizeStridedLayout(inputType))
     return input;
-  return rewriter.create<memref::SubViewOp>(
-      loc, resultType, input, subViewOffsets, subViewSizes, subViewStrides);
+  return rewriter.create<memref::SubViewOp>(loc, resultType, input, offsets,
+                                            sizes, strides);
 }
 
 /// Returns the number of dims that aren't unit dims.
@@ -295,12 +304,18 @@ static int getReducedRank(ArrayRef<int64_t> shape) {
   return llvm::count_if(shape, [](int64_t dimSize) { return dimSize != 1; });
 }
 
-/// Returns a copy of `shape` without unit dims.
-static SmallVector<int64_t> getReducedShape(ArrayRef<int64_t> shape) {
-  SmallVector<int64_t> reducedShape;
-  llvm::copy_if(shape, std::back_inserter(reducedShape),
-                [](int64_t dimSize) { return dimSize != 1; });
-  return reducedShape;
+/// Trims non-scalable one dimensions from `oldType` and returns the result
+/// type.
+static VectorType trimUnitDims(VectorType oldType) {
+  SmallVector<int64_t> newShape;
+  SmallVector<bool> newScalableDims;
+  for (auto [dimIdx, dimSize] : llvm::enumerate(oldType.getShape())) {
+    if (dimSize == 1 && !oldType.getScalableDims()[dimIdx])
+      continue;
+    newShape.push_back(dimSize);
+    newScalableDims.push_back(oldType.getScalableDims()[dimIdx]);
+  }
+  return VectorType::get(newShape, oldType.getElementType(), newScalableDims);
 }
 
 namespace {
@@ -320,9 +335,7 @@ class TransferReadDropUnitDimsPattern
     Value source = transferReadOp.getSource();
     MemRefType sourceType = dyn_cast<MemRefType>(source.getType());
     // TODO: support tensor types.
-    if (!sourceType || !sourceType.hasStaticShape())
-      return failure();
-    if (sourceType.getNumElements() != vectorType.getNumElements())
+    if (!sourceType)
       return failure();
     // TODO: generalize this pattern, relax the requirements here.
     if (transferReadOp.hasOutOfBoundsDim())
@@ -335,23 +348,50 @@ class TransferReadDropUnitDimsPattern
       return failure();
     // Check if the reduced vector shape matches the reduced source shape.
     // Otherwise, this case is not supported yet.
-    int vectorReducedRank = getReducedRank(vectorType.getShape());
-    if (reducedRank != vectorReducedRank)
+    auto reducedVectorType = trimUnitDims(vectorType);
+    if (reducedRank != reducedVectorType.getRank())
       return failure();
     if (llvm::any_of(transferReadOp.getIndices(), [](Value v) {
           return getConstantIntValue(v) != static_cast<int64_t>(0);
         }))
       return failure();
+
+    auto maskOp = transferReadOp.getMask();
+    if (maskOp) {
+      auto createMaskOp = maskOp.getDefiningOp<vector::CreateMaskOp>();
+      if (!createMaskOp)
+        return failure();
+      auto maskType = maskOp.getType();
+      auto reducedMaskType = trimUnitDims(maskType);
+      if (reducedMaskType.getRank() == maskType.getRank())
+        return failure();
+      SmallVector<Value> maskOperands;
+      for (auto [dim, dimIsScalable, maskOperand] :
+           llvm::zip(maskType.getShape(), maskType.getScalableDims(),
+                     createMaskOp.getOperands())) {
+        if (dim == 1 && !dimIsScalable) {
+          // If the mask for the unit dim is not a constant of 1, do nothing.
+          auto constant = maskOperand.getDefiningOp<arith::ConstantIndexOp>();
+          if (!constant || (constant.value() != 1))
+            return failure();
+          continue;
+        }
+        maskOperands.push_back(maskOperand);
+      }
+      maskOp = rewriter.create<vector::CreateMaskOp>(loc, reducedMaskType,
+                                                     maskOperands);
+    }
+
     Value reducedShapeSource =
         rankReducingSubviewDroppingUnitDims(rewriter, loc, source);
     Value c0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
     SmallVector<Value> zeros(reducedRank, c0);
     auto identityMap = rewriter.getMultiDimIdentityMap(reducedRank);
-    auto reducedVectorType = VectorType::get(
-        getReducedShape(vectorType.getShape()), vectorType.getElementType());
-
+    SmallVector<bool> inBounds(reducedVectorType.getRank(), true);
     auto newTransferReadOp = rewriter.create<vector::TransferReadOp>(
-        loc, reducedVectorType, reducedShapeSource, zeros, identityMap);
+        loc, reducedVectorType, reducedShapeSource, zeros, identityMap,
+        transferReadOp.getPadding(), maskOp,
+        rewriter.getBoolArrayAttr(inBounds));
     auto shapeCast = rewriter.createOrFold<vector::ShapeCastOp>(
         loc, vectorType, newTransferReadOp);
     rewriter.replaceOp(transferReadOp, shapeCast);
diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
index 2852e301888cca8..688fcd114041812 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
@@ -82,6 +82,92 @@ func.func @transfer_write_and_vector_rank_reducing_to_0d(
 //       CHECK:   %[[SHCAST:.+]] = vector.shape_cast %[[VECTOR]] : vector<1x1x1xf32> to vector<f32>
 //       CHECK:   vector.transfer_write %[[SHCAST]], %[[SUBVIEW]]{{.*}} : vector<f32>, memref<f32>
 
+func.func @transfer_read_dynamic_rank_reducing(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>) -> vector<[16]x1xi8> {
+    %c0 = arith.constant 0 : index
+    %pad = arith.constant 0 : i8
+    %v = vector.transfer_read %arg[%c0, %c0], %pad {in_bounds = [true, true]} :
+      memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x1xi8>
+    return %v : vector<[16]x1xi8>
+}
+// CHECK-LABEL: func @transfer_read_dynamic_rank_reducing
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//       CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//       CHECK:   %[[DIM0:.+]] = memref.dim %[[ARG]], %[[C0]] : memref<?x1xi8, strided<[?, ?], offset: ?>>
+//       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ARG]][0, 0] [%[[DIM0]], 1] [1, 1] : memref<?x1xi8, {{.*}}> to memref<?xi8, {{.*}}>
+//       CHECK:   vector.transfer_read %[[SUBVIEW]]{{.*}} : memref<?xi8, {{.*}}>, vector<[16]xi8>
+
+func.func @masked_transfer_read_dynamic_rank_reducing(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,
+      %mask_dim0 : index) -> vector<[16]x1xi8> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %pad = arith.constant 0 : i8
+    %mask = vector.create_mask %mask_dim0, %c1 : vector<[16]x1xi1>
+    %v = vector.transfer_read %arg[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
+      memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x1xi8>
+    return %v : vector<[16]x1xi8>
+}
+// CHECK-LABEL: func @masked_transfer_read_dynamic_rank_reducing
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//  CHECK-SAME:     %[[MASK_DIM0:.+]]: index
+//       CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//       CHECK:   %[[PAD:.+]] = arith.constant 0 : i8
+//       CHECK:   %[[MASK:.+]] = vector.create_mask %[[MASK_DIM0]] : vector<[16]xi1>
+//       CHECK:   %[[DIM0:.+]] = memref.dim %[[ARG]], %[[C0]] : memref<?x1xi8, strided<[?, ?], offset: ?>>
+//       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ARG]][0, 0] [%[[DIM0]], 1] [1, 1] : memref<?x1xi8, {{.*}}> to memref<?xi8, {{.*}}>
+//       CHECK:   vector.transfer_read %[[SUBVIEW]][{{.*}}], %[[PAD]], %[[MASK]] {in_bounds = [true]} : memref<?xi8, {{.*}}>, vector<[16]xi8>
+
+/// Only vector.create_mask is currently supported.
+func.func @unsupported_masked_transfer_read_dynamic_rank_reducing_1(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,
+      %mask : vector<[16]x1xi1>) -> vector<[16]x1xi8> {
+    %c0 = arith.constant 0 : index
+    %pad = arith.constant 0 : i8
+    %v = vector.transfer_read %arg[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
+      memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x1xi8>
+    return %v : vector<[16]x1xi8>
+}
+// CHECK-LABEL: func @unsupported_masked_transfer_read_dynamic_rank_reducing_1
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//   CHECK-NOT: vector.create_mask
+//   CHECK-NOT: memref.subview
+//       CHECK: vector.transfer_read %[[ARG]]
+
+/// Unit dim mask must be constant of 1.
+func.func @unsupported_masked_transfer_read_dynamic_rank_reducing_2(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,
+      %mask_dim0 : index, %mask_dim1 : index) -> vector<[16]x1xi8> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %pad = arith.constant 0 : i8
+    %mask = vector.create_mask %mask_dim0, %mask_dim1 : vector<[16]x1xi1>
+    %v = vector.transfer_read %arg[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
+      memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x1xi8>
+    return %v : vector<[16]x1xi8>
+}
+// CHECK-LABEL: func @unsupported_masked_transfer_read_dynamic_rank_reducing_2
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//   CHECK-NOT: memref.subview
+//       CHECK: vector.transfer_read {{.*}} vector<[16]x1xi8>
+
+/// Unit dim must be non-scalable.
+func.func @masked_transfer_read_dynamic_rank_reducing_scalable_unit_dim(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,
+      %mask_dim0 : index) -> vector<[16]x[1]xi8> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %pad = arith.constant 0 : i8
+    %mask = vector.create_mask %mask_dim0, %c1 : vector<[16]x[1]xi1>
+    %v = vector.transfer_read %arg[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
+      memref<?x1xi8, strided<[?, ?], offset: ?>>, vector<[16]x[1]xi8>
+    return %v : vector<[16]x[1]xi8>
+}
+// CHECK-LABEL: func @masked_transfer_read_dynamic_rank_reducing_scalable_unit_dim
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//   CHECK-NOT: memref.subview
+//       CHECK: vector.transfer_read {{.*}} vector<[16]x[1]xi8>
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
     transform.apply_patterns to %func_op {

>From f51737e779c217d28cd53cc4cae9deb8687470bc Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Thu, 9 Nov 2023 19:44:34 +0000
Subject: [PATCH 2/3] [mlir][vector] Add vector.transpose with unit-dim to
 vector.shape_cast pattern

This patch extends the vector.transpose lowering to replace:

  vector.transpose %0, [1, 0] : vector<nx1x<eltty>> to vector<1xnx<eltty>>

with:

  vector.shape_cast %0 : vector<nx1x<eltty>> to vector<1xnx<eltty>>

Source with leading unit-dim (inverse) is also replaced. Unit dim must
be fixed. Non-unit dim can be scalable.

A check is also added to bail out for scalable vectors before unrolling.
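
For example (from the tests added below):

  %0 = vector.transpose %arg0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>

is replaced with:

  %0 = vector.shape_cast %arg0 : vector<[4]x1xf32> to vector<1x[4]xf32>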
---
 .../Transforms/LowerVectorTranspose.cpp       | 21 ++++++
 .../Vector/vector-transpose-lowering.mlir     | 71 +++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
index 7d804ddcfa42ffe..dee786007c80630 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp
@@ -336,6 +336,27 @@ class TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> {
       return rewriter.notifyMatchFailure(
           op, "Options specifies lowering to shuffle");
 
+    // Replace:
+    //   vector.transpose %0, [1, 0] : vector<nx1x<eltty>> to
+    //                                 vector<1xnx<eltty>>
+    // with:
+    //   vector.shape_cast %0 : vector<nx1x<eltty>> to vector<1xnx<eltty>>
+    //
+    // Source with leading unit dim (inverse) is also replaced. Unit dim must
+    // be fixed. Non-unit can be scalable.
+    if (resType.getRank() == 2 &&
+        ((resType.getShape().front() == 1 &&
+          !resType.getScalableDims().front()) ||
+         (resType.getShape().back() == 1 &&
+          !resType.getScalableDims().back())) &&
+        transp == ArrayRef<int64_t>({1, 0})) {
+      rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(op, resType, input);
+      return success();
+    }
+
+    if (inputType.isScalable())
+      return failure();
+
     // Handle a true 2-D matrix transpose differently when requested.
     if (vectorTransformOptions.vectorTransposeLowering ==
             vector::VectorTransposeLowering::Flat &&
diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
index 22d9224838c49c4..c0b44428d5bcf30 100644
--- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
@@ -74,6 +74,17 @@ func.func @transpose1023_1x1x8x8xf32(%arg0: vector<1x1x8x8xf32>) -> vector<1x1x8
   return %0 : vector<1x1x8x8xf32>
 }
 
+/// Scalable dim should not be unrolled.
+
+// CHECK-LABEL: func @transpose23_scalable
+// CHECK-NOT: vector.extract
+// CHECK-NOT: vector.insert
+// CHECK: vector.transpose
+func.func @transpose23_scalable(%arg0: vector<2x[3]xf32>) -> vector<[3]x2xf32> {
+  %0 = vector.transpose %arg0, [1, 0] : vector<2x[3]xf32> to vector<[3]x2xf32>
+  return %0 : vector<[3]x2xf32>
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
     transform.apply_patterns to %func_op {
@@ -778,3 +789,63 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+/// Transpose of rank-2 vector with leading or trailing unit dim to shape_cast.
+
+// CHECK-LABEL: func @transpose10_4x1xf32
+func.func @transpose10_4x1xf32(%arg0: vector<4x1xf32>) -> vector<1x4xf32> {
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<4x1xf32> to vector<1x4xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
+  return %0 : vector<1x4xf32>
+}
+
+// CHECK-LABEL: func @transpose10_nx4x1xf32
+func.func @transpose10_nx4x1xf32(%arg0: vector<[4]x1xf32>) -> vector<1x[4]xf32> {
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<[4]x1xf32> to vector<1x[4]xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
+  return %0 : vector<1x[4]xf32>
+}
+
+// CHECK-LABEL: func @transpose10_1x4xf32
+func.func @transpose10_1x4xf32(%arg0: vector<1x4xf32>) -> vector<4x1xf32> {
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<1x4xf32> to vector<4x1xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
+  return %0 : vector<4x1xf32>
+}
+
+// CHECK-LABEL: func @transpose10_1xnx4xf32
+func.func @transpose10_1xnx4xf32(%arg0: vector<1x[4]xf32>) -> vector<[4]x1xf32> {
+  // CHECK-NEXT: vector.shape_cast %{{.*}} : vector<1x[4]xf32> to vector<[4]x1xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<1x[4]xf32> to vector<[4]x1xf32>
+  return %0 : vector<[4]x1xf32>
+}
+
+/// Scalable unit dim should not be lowered to shape_cast.
+
+// CHECK-LABEL: func @transpose10_4xnx1xf32
+func.func @transpose10_4xnx1xf32(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32> {
+  // CHECK-NOT: vector.shape_cast
+  // CHECK: vector.transpose %{{.*}} : vector<4x[1]xf32> to vector<[1]x4xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<4x[1]xf32> to vector<[1]x4xf32>
+  return %0 : vector<[1]x4xf32>
+}
+
+// CHECK-LABEL: func @transpose10_nx4xnx1xf32
+func.func @transpose10_nx4xnx1xf32(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32> {
+  // CHECK-NOT: vector.shape_cast
+  // CHECK: vector.transpose %{{.*}} : vector<4x[1]xf32> to vector<[1]x4xf32>
+  %0 = vector.transpose %arg0, [1, 0] : vector<4x[1]xf32> to vector<[1]x4xf32>
+
+  return %0 : vector<[1]x4xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.vector.lower_transpose
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}

>From 714023db232dbd1d9933a8bdc4c07a58163707ba Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Thu, 9 Nov 2023 19:44:34 +0000
Subject: [PATCH 3/3] [mlir][linalg] Add an e2e test for linalg.matmul to
 ArmSME

This patch adds an integration test lowering a linalg.matmul to SME via
vector.outerproduct.

It's similar to the linalg.matmul_transpose_a e2e test added recently, but it
additionally requires the following patterns:

  * vector.rank_reducing_subview_patterns
  * vector.lower_transpose

to lower the following sequence (taken from the inner loop):

  %subview = memref.subview %arg0[%arg3, %arg5] [%2, 1] [1, 1] :
    memref<?x?xf32, strided<[?, ?], offset: ?>> to memref<?x1xf32, strided<[?, ?], offset: ?>>
  %mask = vector.create_mask %2, %c1 : vector<[4]x1xi1>
  %0 = vector.transfer_read %subview[%c0, %c0], %pad, %mask {in_bounds = [true, true]} :
    memref<?x1xf32, strided<[?, ?], offset: ?>>, vector<[4]x1xf32>
  %1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
  %2 = vector.extract %1[0] : vector<[4]xf32> from vector<1x[4]xf32>

such that the unit dim is dropped and the vector.transfer_read can be
lowered via the generic path; rank-2 vectors with a leading scalable dim
can't be type converted to an array.
---
 .../Dialect/Linalg/CPU/ArmSME/matmul.mlir     | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
new file mode 100644
index 000000000000000..8c08263ab756be2
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
@@ -0,0 +1,104 @@
+// RUN: mlir-opt %s \
+// RUN:   -transform-interpreter -test-transform-dialect-erase-schedule \
+// RUN:   -canonicalize -enable-arm-streaming="mode=locally enable-za" \
+// RUN:   -convert-vector-to-arm-sme -convert-arm-sme-to-scf \
+// RUN:   -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \
+// RUN:   -convert-vector-to-llvm=enable-arm-sme \
+// RUN:   -convert-vector-to-llvm=enable-arm-sve \
+// RUN:   -cse -canonicalize -allocate-arm-sme-tiles -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd \
+// RUN:   -e=main -entry-point-result=void \
+// RUN:   -march=aarch64 -mattr="+sve,+sme" \
+// RUN:   -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \
+// RUN: FileCheck %s
+
+func.func @matmul(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>, %C : tensor<?x?xf32>) {
+  %res = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
+                       outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32>
+  %xf = tensor.cast %res : tensor<?x?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+  return
+}
+
+func.func @main() {
+  %c0 = arith.constant 0 : i32
+  %c7 = arith.constant 7 : index
+
+  %A = arith.constant dense<[
+    [ 1.,  8., 15., 22., 29., 36., 43., 50., 57., 64., 71., 78., 85.],
+    [ 2.,  9., 16., 23., 30., 37., 44., 51., 58., 65., 72., 79., 86.],
+    [ 3., 10., 17., 24., 31., 38., 45., 52., 59., 66., 73., 80., 87.],
+    [ 4., 11., 18., 25., 32., 39., 46., 53., 60., 67., 74., 81., 88.],
+    [ 5., 12., 19., 26., 33., 40., 47., 54., 61., 68., 75., 82., 89.],
+    [ 6., 13., 20., 27., 34., 41., 48., 55., 62., 69., 76., 83., 90.],
+    [ 7., 14., 21., 28., 35., 42., 49., 56., 63., 70., 77., 84., 91.]
+  ]> : tensor<7x13xf32>
+
+  %B_init = tensor.empty() : tensor<13x7xf32>
+  %B = linalg.transpose ins(%A: tensor<7x13xf32>)
+                        outs(%B_init: tensor<13x7xf32>) permutation = [1, 0]
+
+  %A_dyn = tensor.cast %A : tensor<7x13xf32> to tensor<?x?xf32>
+  %B_dyn = tensor.cast %B : tensor<13x7xf32> to tensor<?x?xf32>
+
+  %C_init = bufferization.alloc_tensor(%c7, %c7) : tensor<?x?xf32>
+  %C = linalg.fill ins(%c0 : i32) outs(%C_init : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+  // CHECK: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [7, 7] strides = [7, 1] data =
+  // CHECK: [32955, 33514, 34073, 34632, 35191, 35750, 36309]
+  // CHECK: [33514, 34086, 34658, 35230, 35802, 36374, 36946]
+  // CHECK: [34073, 34658, 35243, 35828, 36413, 36998, 37583]
+  // CHECK: [34632, 35230, 35828, 36426, 37024, 37622, 38220]
+  // CHECK: [35191, 35802, 36413, 37024, 37635, 38246, 38857]
+  // CHECK: [35750, 36374, 36998, 37622, 38246, 38870, 39494]
+  // CHECK: [36309, 36946, 37583, 38220, 38857, 39494, 40131]
+  call @matmul(%A_dyn, %B_dyn, %C) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> ()
+
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module : !transform.any_op {transform.consumed}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+
+    // Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where
+    // SVLs is the number of 32-bit elements in a vector of SVL bits.
+    %tiled_linalg_op, %loops:3 = transform.structured.tile_using_for %matmul[[4], [4], 1]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize.
+    transform.structured.vectorize %tiled_linalg_op vector_sizes [[4], [4], 1]
+      : !transform.any_op
+
+    // Step 3: Bufferize ahead of TransferReadDropUnitDimsPattern, which
+    // currently only supports memrefs.
+    %bufferize = transform.bufferization.one_shot_bufferize %module
+      {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op
+
+    %func = transform.structured.match ops{["func.func"]} in %bufferize
+      : (!transform.any_op) -> !transform.any_op
+
+    // Step 4: Lower vector.multi_reduction to vector.contract (+ some helpful patterns).
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_masked_transfers
+      transform.apply_patterns.vector.transfer_permutation_patterns
+      transform.apply_patterns.vector.reduction_to_contract
+    } : !transform.any_op
+
+    // Step 5: Lower vector.contract to vector.outerproduct. Also drop unit
+    // dims, specifically to prevent vector.transfer_read of vector<[4]x1xf32>,
+    // which can't be lowered via the generic path. Lower transpose converts a
+    // transpose with unit dims to a shape_cast that can be canonicalized away.
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+      transform.apply_patterns.vector.lower_masks
+      transform.apply_patterns.vector.rank_reducing_subview_patterns
+      transform.apply_patterns.vector.lower_transpose
+    } : !transform.any_op
+
+    transform.yield
+  }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)


