[Mlir-commits] [mlir] [mlir][vector] Update tests for collapse 2/n (nfc) (PR #94604)

Sun Jun 9 07:53:41 PDT 2024

https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/94604

>From 48c614fb8ece5a45b998c9b4206aeb80ec5d2d8b Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 5 Jun 2024 16:39:15 +0100
Subject: [PATCH 1/3] [mlir][vector] Update tests for collapse 1/n (nfc)

The main goal of this PR (and subsequent PRs), is to add more tests with
scalable vectors to:
  * vector-transfer-collapse-inner-most-dims.mlir

There's quite a few cases to consider, hence this is split into multiple
PRs. In this PR, the very first test is complemented with all the possible
combinations:
  * scalable (rather than fixed) unit trailing dim,
  * dynamic (rather than static) trailing dim in the source memref.

Also,
  * @leading_scalable_dimension_transfer_read and
    @trailing_scalable_one_dim_transfer_read,

are replaced with:
  * @contiguous_inner_most_scalable_inner_dim and
    @negative_scalable_unit_dim,

respectively, and added to the list above (i.e. alongside other
variations for the very first test).

In addition:
  * "_view" is removed from function names (it's not clear to me what it
    was meant to signify)
  * extra comments are added to separate tests for vector.transfer_read
    and vector.transfer_write

NOTE: This PR is limited to tests for `vector.transfer_read`.
---
 ...tor-transfer-collapse-inner-most-dims.mlir | 91 ++++++++++++-------
 1 file changed, 59 insertions(+), 32 deletions(-)

diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
index b4cb640108bae..0123709004bd6 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
@@ -1,12 +1,17 @@
 // RUN: mlir-opt %s -test-vector-transfer-collapse-inner-most-dims -split-input-file | FileCheck %s
 
-func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
+//-----------------------------------------------------------------------------
+// 1. vector.transfer_read
+//-----------------------------------------------------------------------------
+
+func.func @contiguous_inner_most(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
   %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32>
   return %0 : vector<1x8x1xf32>
 }
-//      CHECK: func @contiguous_inner_most_view(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
+
+//      CHECK: func @contiguous_inner_most(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
 //      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
 // CHECK-SAME:    memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>
 //      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[SRC_0]]
@@ -14,15 +19,61 @@ func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8,
 //      CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[VEC]]
 //      CHECK:   return %[[RESULT]]
 
+// Same as the top example within this split, but with the inner vector
+// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_scalable_inner_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x[8]x1xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x[8]x1xf32>
+  return %0 : vector<1x[8]x1xf32>
+}
+
+//      CHECK: func @contiguous_inner_most_scalable_inner_dim(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
+//      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
+// CHECK-SAME:    memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>
+//      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[SRC_0]]
+// CHECK-SAME:    memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>, vector<1x[8]xf32>
+//      CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[VEC]]
+//      CHECK:   return %[[RESULT]]
+
+// Same as the top example within this split, but the trailing unit dim was
+// replaced with a dyn dim - not supported
+
+func.func @non_unit_trailing_dim(%in: memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32>
+  return %0 : vector<1x8x1xf32>
+}
+
+//  CHECK-LABEL: func @non_unit_trailing_dim
+//    CHECK-NOT: memref.subview
+//    CHECK-NOT: vector.shape_cast
+
+// Same as the top example within this split, but with a scalable unit dim in
+// the output vector - not supported
+
+func.func @negative_scalable_unit_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x[1]xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32>
+  return %0 : vector<1x8x[1]xf32>
+}
+//  CHECK-LABEL: func @scalable_unit_dim
+//    CHECK-NOT: memref.subview
+//    CHECK-NOT: vector.shape_cast
+
 // -----
 
-func.func @contiguous_outer_dyn_inner_most_view(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
+func.func @contiguous_outer_dyn_inner_most(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
   %c0 = arith.constant 0 : index
   %pad = arith.constant 0.0 : f32
   %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref<?x?x8x1xf32>, vector<8x1xf32>
   return %v : vector<8x1xf32>
 }
-// CHECK: func.func @contiguous_outer_dyn_inner_most_view(
+// CHECK: func.func @contiguous_outer_dyn_inner_most(
 // CHECK-SAME:   %[[IDX0:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[IDX1:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]
@@ -103,6 +154,10 @@ func.func @contiguous_inner_most_dim_out_of_bounds_2d(%arg0: memref<1x1xf32>) ->
 
 // -----
 
+//-----------------------------------------------------------------------------
+// 2. vector.transfer_write
+//-----------------------------------------------------------------------------
+
 func.func @drop_two_inner_most_dim_for_transfer_write(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) {
   %c0 = arith.constant 0 : index
   vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0]
@@ -177,21 +232,6 @@ func.func @non_unit_strides(%arg0: memref<512x16x1xf32, strided<[8192, 16, 4], o
 
 // -----
 
-func.func @leading_scalable_dimension_transfer_read(%dest : memref<24x1xf32>) -> vector<[4]x1xf32> {
-  %c0 = arith.constant 0 : index
-  %pad = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %dest[%c0, %c0], %pad {in_bounds = [true, true]} : memref<24x1xf32>, vector<[4]x1xf32>
-  return %0 : vector<[4]x1xf32>
-}
-// CHECK:      func.func @leading_scalable_dimension_transfer_read
-// CHECK-SAME:   %[[DEST:[a-zA-Z0-9]+]]
-// CHECK:        %[[SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0] [24, 1] [1, 1] : memref<24x1xf32> to memref<24xf32, strided<[1]>>
-// CHECK:        %[[READ:.+]] = vector.transfer_read %[[SUBVIEW]]{{.*}} {in_bounds = [true]} : memref<24xf32, strided<[1]>>, vector<[4]xf32>
-// CHECK:        %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<[4]xf32> to vector<[4]x1xf32>
-// CHECK:        return %[[CAST]]
-
-// -----
-
 // Negative test: [1] (scalable 1) is _not_ a unit dimension.
 func.func @trailing_scalable_one_dim_transfer_read(%dest : memref<24x1xf32>) -> vector<4x[1]xf32> {
   %c0 = arith.constant 0 : index
@@ -217,16 +257,3 @@ func.func @leading_scalable_dimension_transfer_write(%dest : memref<24x1xf32>, %
 // CHECK:        %[[SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0] [24, 1] [1, 1] : memref<24x1xf32> to memref<24xf32, strided<[1]>>
 // CHECK:        %[[CAST:.+]] = vector.shape_cast %[[VEC]] : vector<[4]x1xf32> to vector<[4]xf32>
 // CHECK:        vector.transfer_write %[[CAST]], %[[SUBVIEW]]{{.*}} {in_bounds = [true]} : vector<[4]xf32>, memref<24xf32, strided<[1]>>
-
-// -----
-
-// Negative test: [1] (scalable 1) is _not_ a unit dimension.
-func.func @trailing_scalable_one_dim_transfer_write(%dest : memref<24x1xf32>, %vec: vector<4x[1]xf32>, %index: index) {
-  %c0 = arith.constant 0 : index
-  vector.transfer_write %vec, %dest[%index, %c0] {in_bounds = [true, true]} : vector<4x[1]xf32>,  memref<24x1xf32>
-  return
-}
-// CHECK:      func.func @trailing_scalable_one_dim_transfer_write
-// CHECK-NOT:    vector.shape_cast
-// CHECK:        vector.transfer_write {{.*}} : vector<4x[1]xf32>,  memref<24x1xf32>
-// CHECK-NOT:    vector.shape_cast

>From e9731a0a4ec6e26148c47895ed64a097e7090b3c Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 6 Jun 2024 18:07:09 +0100
Subject: [PATCH 2/3] fixup! [mlir][vector] Update tests for collapse 1/n (nfc)

Fix failing test
---
 .../vector-transfer-collapse-inner-most-dims.mlir | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
index 0123709004bd6..9b23681dba6a8 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
@@ -61,7 +61,7 @@ func.func @negative_scalable_unit_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8,
   %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32>
   return %0 : vector<1x8x[1]xf32>
 }
-//  CHECK-LABEL: func @scalable_unit_dim
+//  CHECK-LABEL: func @negative_scalable_unit_dim
 //    CHECK-NOT: memref.subview
 //    CHECK-NOT: vector.shape_cast
 
@@ -257,3 +257,16 @@ func.func @leading_scalable_dimension_transfer_write(%dest : memref<24x1xf32>, %
 // CHECK:        %[[SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0] [24, 1] [1, 1] : memref<24x1xf32> to memref<24xf32, strided<[1]>>
 // CHECK:        %[[CAST:.+]] = vector.shape_cast %[[VEC]] : vector<[4]x1xf32> to vector<[4]xf32>
 // CHECK:        vector.transfer_write %[[CAST]], %[[SUBVIEW]]{{.*}} {in_bounds = [true]} : vector<[4]xf32>, memref<24xf32, strided<[1]>>
+
+// -----
+
+// Negative test: [1] (scalable 1) is _not_ a unit dimension.
+func.func @trailing_scalable_one_dim_transfer_write(%dest : memref<24x1xf32>, %vec: vector<4x[1]xf32>, %index: index) {
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %dest[%index, %c0] {in_bounds = [true, true]} : vector<4x[1]xf32>,  memref<24x1xf32>
+  return
+}
+// CHECK:      func.func @trailing_scalable_one_dim_transfer_write
+// CHECK-NOT:    vector.shape_cast
+// CHECK:        vector.transfer_write {{.*}} : vector<4x[1]xf32>,  memref<24x1xf32>
+// CHECK-NOT:    vector.shape_cast

>From 0a64da6b977c06d5e91c3e6bafee24277247df68 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 6 Jun 2024 11:21:45 +0100
Subject: [PATCH 3/3] [mlir][vector] Update tests for collapse 2/n (nfc)

The main goal of this PR (and subsequent PRs), is to add more tests with
scalable vectors to:
  * vector-transfer-collapse-inner-most-dims.mlir

Changes in this PR:

1. Renamed `@contiguous_inner_most_dim_bounds` as
   `@contiguous_inner_most_dim_with_subview`. This test was introduced
   to make sure that the `in_bounds` attribute is correctly preserved,
   but that's already verified by some earlier tests. The updated name
   highlights the differentiating factor of this test when compared to
   the other tests currently present in the file (i.e. the presence of
   `memref.subview` in the input IR).

3. Updated test names to make them more consistent with the other tests
   and to highlight the unique features that are being tested (e.g.
   `@contiguous_inner_most_dim` ->
   `@contiguous_inner_most_dim_non_zero_idxs` for a test

2. Added more tests for scalable vectors - this should cover all cases
   for `vector.transfer_read`.

NOTE: This PR is limited to tests for `vector.transfer_read`.

Follow-up for: #94490
---
 ...tor-transfer-collapse-inner-most-dims.mlir | 116 +++++++++++++++---
 1 file changed, 100 insertions(+), 16 deletions(-)

diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
index 9b23681dba6a8..60c80c2d8eb96 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
@@ -67,13 +67,13 @@ func.func @negative_scalable_unit_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8,
 
 // -----
 
-func.func @contiguous_outer_dyn_inner_most(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
+func.func @contiguous_inner_most_dynamic_outer(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
   %c0 = arith.constant 0 : index
   %pad = arith.constant 0.0 : f32
   %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref<?x?x8x1xf32>, vector<8x1xf32>
   return %v : vector<8x1xf32>
 }
-// CHECK: func.func @contiguous_outer_dyn_inner_most(
+// CHECK: func.func @contiguous_inner_most_dynamic_outer
 // CHECK-SAME:   %[[IDX0:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[IDX1:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]
@@ -89,68 +89,152 @@ func.func @contiguous_outer_dyn_inner_most(%a: index, %b: index, %memref: memref
 // CHECK:        %[[RESULT:.+]] = vector.shape_cast %[[VEC]]
 // CHECK:        return %[[RESULT]]
 
+// Same as the top example within this split, but with the outer vector
+// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<[8]x1xf32> {
+  %c0 = arith.constant 0 : index
+  %pad = arith.constant 0.0 : f32
+  %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref<?x?x8x1xf32>, vector<[8]x1xf32>
+  return %v : vector<[8]x1xf32>
+}
+// CHECK-LABEL:  func @contiguous_inner_most_outer_dim_dyn_scalable_inner_dim
+// CHECK-SAME:   %[[IDX0:[a-zA-Z0-9]+]]
+// CHECK-SAME:   %[[IDX1:[a-zA-Z0-9]+]]
+// CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]
+// CHECK:         %[[VIEW:.+]] = memref.subview %[[SRC]]{{.*}} memref<?x?x8x1xf32> to memref<?x?x8xf32, strided<[?, 8, 1], offset: ?>>
+// CHECK:         %[[VEC_READ:.+]] = vector.transfer_read %[[VIEW]]
+// CHECK-SAME:    {in_bounds = [true]}
+// CHECK-SAME:     memref<?x?x8xf32, strided<[?, 8, 1], offset: ?>>, vector<[8]xf32>
+// CHECK:         vector.shape_cast %[[VEC_READ]]
+
 // -----
 
-func.func @contiguous_inner_most_dim(%A: memref<16x1xf32>, %i:index, %j:index) -> (vector<8x1xf32>) {
+func.func @contiguous_inner_most_dim_non_zero_idxs(%A: memref<16x1xf32>, %i:index, %j:index) -> (vector<8x1xf32>) {
   %c0 = arith.constant 0 : index
   %f0 = arith.constant 0.0 : f32
   %1 = vector.transfer_read %A[%i, %j], %f0 : memref<16x1xf32>, vector<8x1xf32>
   return %1 : vector<8x1xf32>
 }
-//      CHECK: func @contiguous_inner_most_dim(%[[SRC:.+]]: memref<16x1xf32>, %[[I:.+]]: index, %[[J:.+]]: index) -> vector<8x1xf32>
+//      CHECK: func @contiguous_inner_most_dim_non_zero_idxs(%[[SRC:.+]]: memref<16x1xf32>, %[[I:.+]]: index, %[[J:.+]]: index) -> vector<8x1xf32>
 //      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
 // CHECK-SAME:     memref<16x1xf32> to memref<16xf32, strided<[1]>>
 //      CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_0]]
-//      CHECK:   %[[RESULT]] = vector.shape_cast %[[V]] : vector<8xf32> to vector<8x1xf32>
+//      CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[V]] : vector<8xf32> to vector<8x1xf32>
 //      CHECK:   return %[[RESULT]]
 
+// Same as the top example within this split, but with the outer vector
+// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_dim_non_zero_idxs_scalable_inner_dim(%A: memref<16x1xf32>, %i:index, %j:index) -> (vector<[8]x1xf32>) {
+  %c0 = arith.constant 0 : index
+  %f0 = arith.constant 0.0 : f32
+  %1 = vector.transfer_read %A[%i, %j], %f0 : memref<16x1xf32>, vector<[8]x1xf32>
+  return %1 : vector<[8]x1xf32>
+}
+// CHECK-LABEL: func @contiguous_inner_most_dim_non_zero_idxs_scalable_inner_dim(
+// CHECK-SAME:    %[[SRC:.+]]: memref<16x1xf32>, %[[I:.+]]: index, %[[J:.+]]: index) -> vector<[8]x1xf32>
+//       CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
+//  CHECK-SAME:     memref<16x1xf32> to memref<16xf32, strided<[1]>>
+//       CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_0]]
+//       CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[V]] : vector<[8]xf32> to vector<[8]x1xf32>
+//       CHECK:   return %[[RESULT]]
+
 // -----
 
-func.func @contiguous_inner_most_dim_bounds(%A: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<4x1xf32>) {
+func.func @contiguous_inner_most_dim_with_subview(%A: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<4x1xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
   %0 = memref.subview %A[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>>
   %1 = vector.transfer_read %0[%ii, %c0], %cst {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<4x1xf32>
   return %1 : vector<4x1xf32>
 }
-//      CHECK: func @contiguous_inner_most_dim_bounds(%[[SRC:.+]]: memref<1000x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1xf32>
+//      CHECK: func @contiguous_inner_most_dim_with_subview(%[[SRC:.+]]: memref<1000x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1xf32>
 //      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
 //      CHECK:   %[[SRC_1:.+]] = memref.subview %[[SRC_0]]
 //      CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_1]]
 // CHECK-SAME:       {in_bounds = [true]}
 // CHECK-SAME:       vector<4xf32>
 
+// Same as the top example within this split, but with the outer vector
+// dim scalable. Note that this example only makes sense when "4 = [4]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_dim_with_subview_scalable_inner_dim(%A: memref<1000x1xf32>, %i:index, %ii:index) -> (vector<[4]x1xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = memref.subview %A[%i, 0] [40, 1] [1, 1] : memref<1000x1xf32> to memref<40x1xf32, strided<[1, 1], offset: ?>>
+  %1 = vector.transfer_read %0[%ii, %c0], %cst {in_bounds = [true, true]} : memref<40x1xf32, strided<[1, 1], offset: ?>>, vector<[4]x1xf32>
+  return %1 : vector<[4]x1xf32>
+}
+// CHECK-LABEL: func @contiguous_inner_most_dim_with_subview_scalable_inner_dim
+//  CHECK-SAME:   %[[SRC:.+]]: memref<1000x1xf32>
+//       CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
+//       CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_0]]
+//  CHECK-SAME:       {in_bounds = [true]}
+//  CHECK-SAME:       vector<[4]xf32>
+
 // -----
 
-func.func @contiguous_inner_most_dim_bounds_2d(%A: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<4x1x1xf32>) {
+func.func @contiguous_inner_most_dim_2d(%A: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<4x1x1xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
   %0 = memref.subview %A[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>
   %1 = vector.transfer_read %0[%ii, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<4x1x1xf32>
   return %1 : vector<4x1x1xf32>
 }
-//      CHECK: func @contiguous_inner_most_dim_bounds_2d(%[[SRC:.+]]: memref<1000x1x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1x1xf32>
+//      CHECK: func @contiguous_inner_most_dim_2d(%[[SRC:.+]]: memref<1000x1x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<4x1x1xf32>
 //      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
 //      CHECK:   %[[SRC_1:.+]] = memref.subview %[[SRC_0]]
 //      CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_1]]
 // CHECK-SAME:       {in_bounds = [true]}
 // CHECK-SAME:       vector<4xf32>
 
+// Same as the top example within this split, but with the outer vector
+// dim scalable. Note that this example only makes sense when "4 = [4]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_dim_2d_scalable_inner_dim(%A: memref<1000x1x1xf32>, %i:index, %ii:index) -> (vector<[4]x1x1xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = memref.subview %A[%i, 0, 0] [40, 1, 1] [1, 1, 1] : memref<1000x1x1xf32> to memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>
+  %1 = vector.transfer_read %0[%ii, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<40x1x1xf32, strided<[1, 1, 1], offset: ?>>, vector<[4]x1x1xf32>
+  return %1 : vector<[4]x1x1xf32>
+}
+// CHECK-LABEL: func @contiguous_inner_most_dim_2d_scalable_inner_dim(
+//  CHECK-SAME:   %[[SRC:.+]]: memref<1000x1x1xf32>, %[[II:.+]]: index, %[[J:.+]]: index) -> vector<[4]x1x1xf32>
+//       CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
+//       CHECK:   %[[SRC_1:.+]] = memref.subview %[[SRC_0]]
+//       CHECK:   %[[V:.+]] = vector.transfer_read %[[SRC_1]]
+//  CHECK-SAME:       {in_bounds = [true]}
+//  CHECK-SAME:       vector<[4]xf32>
+//       CHECK:  vector.shape_cast %[[V]]
+
 // -----
 
-func.func @contiguous_inner_most_dim_out_of_bounds_2d(%arg0: memref<1x1xf32>) -> vector<4x8xf32> {
+func.func @negative_non_unit_inner_vec_dim(%arg0: memref<1x1xf32>) -> vector<4x8xf32> {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
   %0 = vector.transfer_read %arg0[%c0, %c0], %cst : memref<1x1xf32>, vector<4x8xf32>
   return %0 : vector<4x8xf32>
 }
-// The inner most unit dim can not be dropped. In this context, we do not
-// generate rank-reduced memref.subview ops.
-//      CHECK: func.func @contiguous_inner_most_dim_out_of_bounds_2d
-// CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]
+//      CHECK: func.func @negative_non_unit_inner_vec_dim
+//  CHECK-NOT:   memref.subview
+//      CHECK:   vector.transfer_read
+
+// -----
+
+func.func @negative_non_unit_inner_memref_dim(%arg0: memref<1x8xf32>) -> vector<4x1xf32> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cst : memref<1x8xf32>, vector<4x1xf32>
+  return %0 : vector<4x1xf32>
+}
+//      CHECK: func.func @negative_non_unit_inner_memref_dim
 //  CHECK-NOT:   memref.subview
-//      CHECK:   %[[READ:.+]] = vector.transfer_read %[[SRC]]
-//      CHECK:   return %[[READ]] : vector<4x8xf32>
+//      CHECK:   vector.transfer_read
 
 // -----