[Mlir-commits] [mlir] [mlir][nfc] Update vectorize-tensor-extract.mlir (3/N) (PR #119121)

Tue Dec 10 10:25:54 PST 2024

https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/119121

>From 1e6c8b3085b397fedfdba36e568230ab40bde68f Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Sat, 7 Dec 2024 16:11:55 +0000
Subject: [PATCH 1/6] [mlir][linalg] Enable Vectorization of 0-D tensor.extract

This patch removes an assert in `vectorizeTensorExtract` that was blocking
the vectorization of 0-D tensor.extract operations, e.g.:

```mlir
  %1 = tensor.extract %src[] : tensor<f32>
```

As demonstrated by the included tests, this case is already effectively
supported.

**Context**
The removed assert was introduced in #109580 as a guard, pending proper support
and testing for 0-D tensors. This PR addresses that previously undocumented
TODO. Apologies for the oversight!

**Updates and Tests**
* Revised the existing test `@negative_no_loop` to ensure the
  `vectorize_nd_extract` attribute is included, allowing the vectorizer
  to process it. The test was renamed and variables updated for clarity.
* Added a new test `@extract_scalar_from_0d_into_1d` to cover "mixed"
  0-D/1-D tensor extraction, e.g.:
```mlir
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel"]
  } outs(%init : tensor<1xf32>) {
  ^bb0(%in: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<1xf32>

  return %res : tensor<1xf32>
```

**Additional updates**
I also took the liberty and improved test coverage for 0-D tensor in the
vectorizer tests:
* Added a specific test for "0D linalg.generic" in
  "vectorization-with-patterns.mlir".
* Renamed several tests in "vectorization-with-patterns.mlir" to clarify
  that the 0-D case is now covered.
---
 .../Linalg/Transforms/Vectorization.cpp       |  5 --
 .../Linalg/vectorization-with-patterns.mlir   | 48 ++++++++++++++-
 .../Linalg/vectorize-tensor-extract.mlir      | 58 +++++++++++++++----
 3 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e5c96b52acee23..863f2280e46ce6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1115,11 +1115,6 @@ vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state,
   //  b. contiguous loads.
   // Both cases use vector.transfer_read.
 
-  assert(llvm::count_if(resultType.getShape(),
-                        [](uint64_t dim) { return dim != 1; }) &&
-         "Contiguous loads and scalar loads + broadcast only support 1-D "
-         "vectors ATM!");
-
   // Collect indices for `vector.transfer_read`. At this point, the indices will
   // either be scalars or would have been broadcast to vectors matching the
   // result type. For indices that are vectors, there are two options:
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index 0c996bed996d3c..b688a677500c22 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -122,6 +122,48 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+#map = affine_map<() -> ()>
+
+// CHECK-LABEL:   func.func @generic_0d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<f32>, %[[ARG_1:.*]]: tensor<f32>, %[[ARG_2:.*]]: tensor<f32>)
+func.func @generic_0d(%arg0: tensor<f32>, %arg1: tensor<f32>,
+                      %arg2: tensor<f32>) -> tensor<f32> {
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ_0:.*]] = vector.transfer_read %[[ARG_0]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           %[[ARG_0_AS_SCALAR:.*]] = vector.extract %[[READ_0]][] : f32 from vector<f32>
+// CHECK:           %[[READ_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           %[[ARG_1_AS_SCALAR:.*]] = vector.extract %[[READ_1]][] : f32 from vector<f32>
+// CHECK:           %[[READ_2:.*]] = vector.transfer_read %[[ARG_2]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           %[[ARG_2_AS_SCALAR:.*]] = vector.extract %[[READ_2]][] : f32 from vector<f32>
+// CHECK:           %[[MULF:.*]] = arith.mulf %[[ARG_0_AS_SCALAR]], %[[ARG_1_AS_SCALAR]] : f32
+// CHECK:           %[[ADDF:.*]] = arith.addf %[[ARG_2_AS_SCALAR]], %[[MULF]] : f32
+// CHECK:           %[[ADDF_BCAST:.*]] = vector.broadcast %[[ADDF]] : f32 to vector<f32>
+// CHECK:           vector.transfer_write %[[ADDF_BCAST]], %[[ARG_2]][] : vector<f32>, tensor<f32>
+  %res = linalg.generic {
+    indexing_maps = [#map, #map, #map],
+    iterator_types = []
+  } ins(%arg0, %arg1 : tensor<f32>, tensor<f32>)
+    outs(%arg2 : tensor<f32>) {
+  ^bb(%a: f32, %b: f32, %c: f32) :
+    %d = arith.mulf %a, %b: f32
+    %e = arith.addf %c, %d: f32
+    linalg.yield %e : f32
+  } -> tensor<f32>
+
+  return %res : tensor<f32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1  { disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 #matmul_transpose_out_trait = {
   indexing_maps = [
     affine_map<(m, n, k) -> (m, k)>,
@@ -372,7 +414,7 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // CHECK-LABEL: func @test_vectorize_fill
-func.func @test_vectorize_fill_scalar(%A : memref<f32>, %arg0 : f32) {
+func.func @test_vectorize_fill_0d(%A : memref<f32>, %arg0 : f32) {
   // CHECK-SAME: (%[[M:.*]]: memref<f32>, %[[val:.*]]: f32)
   //      CHECK:   %[[VEC:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
   //      CHECK:   vector.transfer_write %[[VEC]], %[[M]][] : vector<f32>, memref<f32>
@@ -410,8 +452,8 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: func @test_vectorize_copy_scalar
-func.func @test_vectorize_copy_scalar(%A : memref<f32>, %B : memref<f32>) {
+// CHECK-LABEL: func @test_vectorize_copy_0d
+func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
   //  CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
   //       CHECK:   %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
   //       CHECK:   %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 1a93d1cd9b7880..775ceed31be04a 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -39,29 +39,67 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 #map = affine_map<() -> ()>
-func.func @negative_no_loops(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-  %1 = linalg.generic {
+func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
+  %res = linalg.generic {
     indexing_maps = [#map],
     iterator_types = []
-  } outs(%arg1 : tensor<f32>) {
-  ^bb0(%arg4: f32):
-    %2 = tensor.extract %arg0[] : tensor<f32>
-    linalg.yield %2 : f32
+  } outs(%init : tensor<f32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
   } -> tensor<f32>
-  return %1 : tensor<f32>
+
+  return %res : tensor<f32>
 }
-// CHECK-LABEL: func.func @negative_no_loops
-// CHECK: tensor.extract
+
+// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_0d(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
 
+// -----
+
+#map = affine_map<(n) -> (n)>
+func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel"]
+  } outs(%init : tensor<1xf32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1xf32>
+
+  return %res : tensor<1xf32>
+}
+// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_1d(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
+// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
 
 // -----
 

>From 39d7b84e2485cd1fb63d9b91daba58d3bec18eb3 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 6 Dec 2024 13:56:24 +0000
Subject: [PATCH 2/6] [mlir][nfc] Update vectorize-tensor-extract.mlir (1/N)

Tests in "vectorize-tensor-extract.mlir" are inconsistent and would
benefit from refactoring to:

* Clearly categorize tests into "contiguous load," "gather load," and
  "scalar load + broadcast" cases, reflecting the structure of
  tensor.extract vectorization.
* Unify variable naming (both MLIR and FileCheck).
* Ensure all tests exercise unmasked vectorization (masked vectorization
  is covered in "vectorize-tensor-extract-masked.mlir").
* Improve and standardize formatting.

These changes will make it easier to identify the test cases being
exercised and simplify future maintenance or refactoring.

This is patch 1/N in the series. Below is a summary of the changes in
this patch.

----------------------------------------------------------------------

This PR updates the `@vectorize_scalar_broadcast_column_tensor` test in
"vectorize-tensor-extract.mlir", which exercises:

  * Vectorization of tensor.extract.
  * A scalar read followed by a broadcast.
  * Reading from a constant column tensor.

Currently, the test uses "masked" vectorization, but the file
exclusively tests unmasked vectorization paths. To address this
inconsistency, this PR removes masking, aligning the test with the rest
of the file. Masked vectorization scenarios remain covered in
"vectorize-tensor-extract-masked.mlir". This update switches from:

  * `transform.structured.vectorize`, to
  * `transform.structured.vectorize_children_and_apply_patterns`.

The latter approach applies canonicalization patterns, significantly
simplifying the generated output.

Additional improvements for readability:

  * Renamed the test function for clarity.
  * Updated variable names and removed unused variables.
  * Added empty lines for better formatting.
---
 .../Linalg/vectorize-tensor-extract.mlir      | 67 +++++++------------
 1 file changed, 26 insertions(+), 41 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 775ceed31be04a..aa1e44166ec9d6 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -845,56 +845,41 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @vectorize_scalar_broadcast_column_tensor(%in: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
+func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
   %c4 = arith.constant 4 : index
   %c0 = arith.constant 0 : index
-  %cst = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
-
-  %out = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%in : tensor<1x1x4xi32>) {
-  ^bb0(%out: i32):
-    %8 = linalg.index 0 : index
-    %idx_0 = linalg.index 0 : index
-    %extracted = tensor.extract %cst[%idx_0, %c0] : tensor<15x1xi32>
-    linalg.yield %extracted : i32
+  %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
+
+  %res = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+    iterator_types = ["parallel", "parallel", "parallel"]}
+    outs(%init : tensor<1x1x4xi32>) {
+
+    ^bb0(%out: i32):
+      %idx = linalg.index 0 : index
+      %extracted = tensor.extract %src[%idx, %c0] : tensor<15x1xi32>
+      linalg.yield %extracted : i32
   } -> tensor<1x1x4xi32>
 
-  return %out:tensor<1x1x4xi32>
+  return %res : tensor<1x1x4xi32>
 }
 
-// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1) -> (0, 0, 0)>
-// CHECK-LABEL:   func.func @vectorize_scalar_broadcast_column_tensor(
-// CHECK-SAME:      %[[VAL_0:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
-// CHECK:           %[[VAL_1:.*]] = arith.constant 4 : index
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
-// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_6:.*]] = arith.constant 4 : index
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_8:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_9:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_7]], %[[VAL_7]]], %[[VAL_8]] : tensor<1x1x4xi32>, vector<1x1x4xi32>
-// CHECK:           %[[VAL_10:.*]] = vector.step : vector<1xindex>
-// CHECK:           %[[VAL_11:.*]] = vector.broadcast %[[VAL_10]] : vector<1xindex> to vector<4x1x1xindex>
-// CHECK:           %[[VAL_12:.*]] = vector.transpose %[[VAL_11]], [2, 1, 0] : vector<4x1x1xindex> to vector<1x1x4xindex>
-// CHECK:           %[[VAL_13:.*]] = vector.step : vector<1xindex>
-// CHECK:           %[[VAL_14:.*]] = vector.broadcast %[[VAL_13]] : vector<1xindex> to vector<4x1x1xindex>
-// CHECK:           %[[VAL_15:.*]] = vector.transpose %[[VAL_14]], [2, 1, 0] : vector<4x1x1xindex> to vector<1x1x4xindex>
-// CHECK:           %[[VAL_16:.*]] = arith.constant dense<true> : vector<1x1x4xi1>
-// CHECK:           %[[VAL_17:.*]] = arith.constant dense<0> : vector<1x1x4xi32>
-// CHECK:           %[[VAL_18:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_19:.*]] = vector.shape_cast %[[VAL_15]] : vector<1x1x4xindex> to vector<4xindex>
-// CHECK:           %[[VAL_20:.*]] = vector.extract %[[VAL_19]][0] : index from vector<4xindex>
-// CHECK:           %[[VAL_21:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_22:.*]] = vector.constant_mask [1] : vector<1xi1>
-// CHECK:           %[[VAL_23:.*]] = vector.mask %[[VAL_22]] { vector.transfer_read %[[VAL_3]]{{\[}}%[[VAL_20]], %[[VAL_2]]], %[[VAL_21]] {in_bounds = [true, true, true], permutation_map = #[[$MAP]]} : tensor<15x1xi32>, vector<1x1x4xi32> } : vector<1xi1> -> vector<1x1x4xi32>
-// CHECK:           %[[VAL_24:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_25:.*]] = vector.transfer_write %[[VAL_23]], %[[VAL_0]]{{\[}}%[[VAL_24]], %[[VAL_24]], %[[VAL_24]]] : vector<1x1x4xi32>, tensor<1x1x4xi32>
-// CHECK:           return %[[VAL_25]] : tensor<1x1x4xi32>
+// CHECK-LABEL:   func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(
+// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
+// CHECK:           %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
+// CHECK:           %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+// CHECK:           %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
+// CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0  vector_sizes [1, 1, 4]{ vectorize_nd_extract } : !transform.any_op
+     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }

>From bad48ea29ef6bc2fb15b7fb1fd726bd7b69f1804 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Sat, 7 Dec 2024 14:21:48 +0000
Subject: [PATCH 3/6] [mlir][nfc] Update vectorize-tensor-extract.mlir (2/N)

Tests in "vectorize-tensor-extract.mlir" are inconsistent and would
benefit from refactoring to:

* Clearly categorize tests into "contiguous load," "gather load," and
  "scalar load + broadcast" cases, reflecting the structure of
  tensor.extract vectorization.
* Unify variable naming (both MLIR and FileCheck).
* Ensure all tests exercise unmasked vectorization (masked vectorization
  is covered in "vectorize-tensor-extract-masked.mlir").
* Improve and standardize formatting.

These changes will make it easier to identify the test cases being
exercised and simplify future maintenance or refactoring.

This is patch 2/N in the series. Below is a summary of the changes in
this patch.

----------------------------------------------------------------------
Summary of patch 2/N
----------------------------------------------------------------------

Makes all tests re-use the same TD sequence. This TD sequence has been
extracted to a dedicted file:
  * "td/vectorize-with-patterns.mlir".

----------------------------------------------------------------------
**DEPENDS ON**
* https://github.com/llvm/llvm-project/pull/118977
* https://github.com/llvm/llvm-project/pull/119079

Please only review the top commit
---
 .../Linalg/td/vectorize-with-patterns.mlir    |  10 +
 .../Linalg/vectorize-tensor-extract.mlir      | 218 ++----------------
 2 files changed, 30 insertions(+), 198 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/td/vectorize-with-patterns.mlir

diff --git a/mlir/test/Dialect/Linalg/td/vectorize-with-patterns.mlir b/mlir/test/Dialect/Linalg/td/vectorize-with-patterns.mlir
new file mode 100644
index 00000000000000..f8d1a50d7430df
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/td/vectorize-with-patterns.mlir
@@ -0,0 +1,10 @@
+module @transforms attributes { transform.with_named_sequence } {
+  transform.named_sequence @vectorize_with_patterns(%module: !transform.any_op {transform.readonly}) {
+
+    %0 = transform.structured.match ops{["linalg.generic"]} in %module : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
+
+    transform.yield
+   }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index aa1e44166ec9d6..25fb7ed125f874 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
+// RUN: mlir-opt -split-input-file \
+// RUN: -transform-preload-library='transform-library-paths=%p/td/vectorize-with-patterns.mlir' \
+// RUN: -transform-interpreter=entry-point=vectorize_with_patterns %s | FileCheck %s
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
@@ -27,15 +29,6 @@ func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi
 // CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
 // CHECK: vector.transfer_write %[[GATHER]]
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // -----
 
 #map = affine_map<() -> ()>
@@ -59,15 +52,6 @@ func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>)
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
 // CHECK:           vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // -----
 
 #map = affine_map<(n) -> (n)>
@@ -92,51 +76,35 @@ func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
 // CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // -----
 
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_nd_tensor_extract_scalar_broadcast(%arg0: tensor<3x3xf32>, %arg2: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
   %c0 = arith.constant 1 : index
   %c1 = arith.constant 2 : index
-  %2 = linalg.generic {
+
+  %res = linalg.generic {
     indexing_maps = [#map],
     iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%arg2 : tensor<1x1x3xf32>) {
+  } outs(%init : tensor<1x1x3xf32>) {
   ^bb0(%arg4: f32):
-    %7 = tensor.extract %arg0[%c0, %c1] : tensor<3x3xf32>
-    linalg.yield %7 : f32
+    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
+    linalg.yield %1 : f32
   } -> tensor<1x1x3xf32>
-  return %2 : tensor<1x1x3xf32>
+
+  return %res : tensor<1x1x3xf32>
 }
 
-// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1) -> (0, 0, 0)>
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
-// CHECK-SAME:      %[[ARG_0:.*]]: tensor<3x3xf32>,
-// CHECK-SAME:      %[[ARG_1:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[MASK:.*]] = vector.constant_mask [1] : vector<1xi1>
-// CHECK:           %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C1]], %[[C2]]], {{.*}} {in_bounds = [true, true, true], permutation_map = #[[$MAP]]} : tensor<3x3xf32>, vector<1x1x3xf32> } : vector<1xi1> -> vector<1x1x3xf32>
-// CHECK:           %[[C0_2:.*]] = arith.constant 0 : index
-// CHECK:           vector.transfer_write %[[READ]], %[[ARG_1]]{{\[}}%[[C0_2]], %[[C0_2]], %[[C0_2]]] : vector<1x1x3xf32>, tensor<1x1x3xf32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 { vectorize_nd_extract }  : !transform.any_op
-    transform.yield
-   }
-}
+// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
 
 // -----
 
@@ -207,15 +175,6 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
 // CHECK:           %[[RES:.*]] = vector.transfer_write %[[BCAST]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>
 // CHECK:           return %[[RES]] : tensor<3x1x1xf32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
  // -----
 
 func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
@@ -259,15 +218,6 @@ func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16
 // CHECK:           return %[[VAL_21]] : tensor<1x4xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
@@ -309,15 +259,6 @@ func.func @vectorize_nd_tensor_extract_index_from_tensor(%arg0: tensor<3x3xf32>,
 // CHECK:    %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]]] [%[[T]]], %[[CST_1]], %[[PASSTHRU]] : tensor<3x3xf32>, vector<4x7x3x2xindex>, vector<4x7x3x2xi1>, vector<4x7x3x2xf32> into vector<4x7x3x2xf32>
 // CHECK:    vector.transfer_write %[[GATHER]], %[[ARG4]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true, true]} : vector<4x7x3x2xf32>, tensor<4x7x3x2xf32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -339,15 +280,6 @@ func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load(%
   return %1 : tensor<8x1xf32>
 }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // CHECK-LABEL: func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load
 // CHECK-SAME: %[[ARG0:.*]]: tensor<8x128x768xf32>
 // CHECK-SAME: %[[ARG1:.*]]: index
@@ -390,15 +322,6 @@ func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>)
   return %res : tensor<8x1xf32>
 }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // CHECK-LABEL:   func.func @index_from_output_column_vector_gather_load(
 // CHECK-SAME:      %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
 // CHECK:           %[[C128:.*]] = arith.constant dense<128> : vector<1x8xindex>
@@ -437,15 +360,6 @@ func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf3
   return %res : tensor<8x1xf32>
 }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
 // CHECK-LABEL:   func.func @index_from_output_column_vector_contiguous_load(
 // CHECK-SAME:      %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
 // CHECK:           %[[C0:.*]] = arith.constant 0 : index
@@ -497,15 +411,6 @@ func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32
 // CHECK:           %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
 // CHECK:           return %[[VAL_14]] : tensor<5xf32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 // The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
@@ -540,15 +445,6 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<8
 // CHECK:           return %[[VAL_12]] : tensor<1x4xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
@@ -585,16 +481,6 @@ func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20
 // for address calculation also satisfy the required conditions).
 // CHECK:           vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
 
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 // The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Gather load.
@@ -632,15 +518,6 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16
 // CHECK:           return %[[VAL_14]] : tensor<1x4xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 // Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Gather load.
@@ -674,15 +551,6 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32
 // CHECK:           return %[[VAL_10]] : tensor<1x4xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 // Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
@@ -718,15 +586,6 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16
 // CHECK:           return %[[VAL_9]] : tensor<1x4xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 // The vectorizer assumes it's a gather load whenever using a block argument to calculate an index.
@@ -759,15 +618,6 @@ func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1:
 // CHECK:           return %[[VAL_12]] : tensor<5xf32>
 // CHECK:         }
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
@@ -788,15 +638,6 @@ func.func @vectorize_0d_tensor_extract(%arg0: tensor<f32>, %arg2: tensor<1x1x3xf
 // CHECK:           %[[EXTRACT:.*]] = vector.transfer_read %[[ARG_0]][], %{{.+}} : tensor<f32>
 // CHECK:           vector.broadcast %[[EXTRACT]] : vector<f32> to vector<1x1x3xf32>
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
 // -----
 
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
@@ -833,16 +674,6 @@ func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1
 // CHECK:        %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
 // CHECK:        vector.transfer_write %[[GATHER]]
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-     transform.yield
-   }
-}
-
-
 // -----
 
 func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
@@ -874,12 +705,3 @@ func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
 // CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}

>From b46a781f42d67fe2d1914d97f77e291d59b34121 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Tue, 10 Dec 2024 18:17:00 +0000
Subject: [PATCH 4/6] fixup! [mlir][nfc] Update vectorize-tensor-extract.mlir
 (2/N)

Restore CHECK-DAG
---
 mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 25fb7ed125f874..25435cf51a6156 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -98,10 +98,10 @@ func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
 // CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
 // CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
-// CHECK:           %[[C2:.*]] = arith.constant 2 : index
-// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
 // CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

>From adabdf492fc12601e668dbeac030361f20752cb7 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Sat, 7 Dec 2024 12:09:28 +0000
Subject: [PATCH 5/6] [mlir][nfc] Update vectorize-tensor-extract.mlir (3/N)

Tests in "vectorize-tensor-extract.mlir" are inconsistent and would
benefit from refactoring to:

* Clearly categorize tests into "contiguous load," "gather load," and
  "scalar load + broadcast" cases, reflecting the structure of
  tensor.extract vectorization.
* Unify variable naming (both MLIR and FileCheck).
* Ensure all tests exercise unmasked vectorization (masked vectorization
  is covered in "vectorize-tensor-extract-masked.mlir").
* Improve and standardize formatting.

These changes will make it easier to identify the test cases being
exercised and simplify future maintenance or refactoring.

This is patch 3/N in the series. Below is a summary of the changes in
this patch.

----------------------------------------------------------------------
Summary of patch 3/N
----------------------------------------------------------------------

* Cluster all tests for "scalar load + broadcast" together
* Unify MLIR and FileCheck variable names, e.g. `%input`, `%output` ->
  `%src`, `%init`.

Note, I haven't changed test function names to make it easier to track
changes (this PR is mostly about moving code). I will send a seperate PR
to rename the tests.

----------------------------------------------------------------------

**DEPENDS ON**
* https://github.com/llvm/llvm-project/pull/118977
* https://github.com/llvm/llvm-project/pull/119079
* https://github.com/llvm/llvm-project/pull/118977

Please only review the top commit
---
 .../Linalg/vectorize-tensor-extract.mlir      | 269 +++++++++---------
 1 file changed, 137 insertions(+), 132 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 25435cf51a6156..0ee3898af8baac 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -31,83 +31,6 @@ func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi
 
 // -----
 
-#map = affine_map<() -> ()>
-func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = []
-  } outs(%init : tensor<f32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<f32>
-
-  return %res : tensor<f32>
-}
-
-// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_0d(
-// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME:      %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
-// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK:           vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
-
-// -----
-
-#map = affine_map<(n) -> (n)>
-func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel"]
-  } outs(%init : tensor<1xf32>) {
-  ^bb0(%in: f32):
-    %1 = tensor.extract %src[] : tensor<f32>
-    linalg.yield %1 : f32
-  } -> tensor<1xf32>
-
-  return %res : tensor<1xf32>
-}
-// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_1d(
-// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
-// CHECK-SAME:      %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
-// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
-// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %c0 = arith.constant 1 : index
-  %c1 = arith.constant 2 : index
-
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%init : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
-    linalg.yield %1 : f32
-  } -> tensor<1x1x3xf32>
-
-  return %res : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
-// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
-// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
-// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
-// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
-
-// -----
-
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @vectorize_nd_tensor_extract_transfer_read_basic(
     %arg0: tensor<3x3x3xf32>,
@@ -144,37 +67,6 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic(
 // CHECK:   %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[IDX3]]], %[[CST]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
 // CHECK:   vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
 
-// Same as example above, but reading into a column tensor.
-
-// TODO: Currently this fails to vectorise when the indices are non-constant.
-
-func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-    %input: tensor<3x3x3xf32>,
-    %output: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
-
-  %c0 = arith.constant 0 : index
-  %res = linalg.generic {
-    indexing_maps = [#map],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%output : tensor<3x1x1xf32>) {
-  ^bb0(%out: f32):
-    %5 = tensor.extract %input[%c0, %c0, %c0] : tensor<3x3x3xf32>
-    linalg.yield %5 : f32
-  } -> tensor<3x1x1xf32>
-
-  return %res : tensor<3x1x1xf32>
-}
-
-// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
-// CHECK-SAME:      %[[INPUT:.*]]: tensor<3x3x3xf32>,
-// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<3x1x1xf32>)
-// CHECK-DAG:        %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:        %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[READ:.*]] = vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
-// CHECK:           %[[BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
-// CHECK:           %[[RES:.*]] = vector.transfer_write %[[BCAST]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>
-// CHECK:           return %[[RES]] : tensor<3x1x1xf32>
-
  // -----
 
 func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
@@ -620,26 +512,6 @@ func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1:
 
 // -----
 
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @vectorize_0d_tensor_extract(%arg0: tensor<f32>, %arg2: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-  %2 = linalg.generic {
-    indexing_maps = [#map1],
-    iterator_types = ["parallel", "parallel", "parallel"]
-  } outs(%arg2 : tensor<1x1x3xf32>) {
-  ^bb0(%arg4: f32):
-    %7 = tensor.extract %arg0[] : tensor<f32>
-    linalg.yield %7 : f32
-  } -> tensor<1x1x3xf32>
-  return %2 : tensor<1x1x3xf32>
-}
-
-// CHECK-LABEL:   func.func @vectorize_0d_tensor_extract(
-// CHECK-SAME:     %[[ARG_0:.*]]: tensor<f32>
-// CHECK:           %[[EXTRACT:.*]] = vector.transfer_read %[[ARG_0]][], %{{.+}} : tensor<f32>
-// CHECK:           vector.broadcast %[[EXTRACT]] : vector<f32> to vector<1x1x3xf32>
-
-// -----
-
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
 func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
@@ -674,17 +546,118 @@ func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1
 // CHECK:        %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
 // CHECK:        vector.transfer_write %[[GATHER]]
 
+//===----------------------------------------------------------------------===//
+// Scalar load + broadcast
+//===----------------------------------------------------------------------===//
+
 // -----
 
-func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %c0 = arith.constant 1 : index
+  %c1 = arith.constant 2 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
+
+// -----
+
+#map = affine_map<() -> ()>
+func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = []
+  } outs(%init : tensor<f32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<f32>
+
+  return %res : tensor<f32>
+}
+
+// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_0d(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>
+
+// -----
+
+#map = affine_map<(n) -> (n)>
+func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel"]
+  } outs(%init : tensor<1xf32>) {
+  ^bb0(%in: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1xf32>
+
+  return %res : tensor<1xf32>
+}
+// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_1d(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
+// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_0d_tensor_extract(%src: tensor<f32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
+  %res = linalg.generic {
+    indexing_maps = [#map1],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x3xf32>) {
+  ^bb0(%arg4: f32):
+    %1 = tensor.extract %src[] : tensor<f32>
+    linalg.yield %1 : f32
+  } -> tensor<1x1x3xf32>
+  return %res : tensor<1x1x3xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_0d_tensor_extract(
+// CHECK-SAME:     %[[SRC:.*]]: tensor<f32>
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %{{.+}} : tensor<f32>
+// CHECK:           vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
+
+// -----
+
+func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
   %c4 = arith.constant 4 : index
   %c0 = arith.constant 0 : index
   %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
 
   %res = linalg.generic {
     indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-    iterator_types = ["parallel", "parallel", "parallel"]}
-    outs(%init : tensor<1x1x4xi32>) {
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<1x1x4xi32>) {
 
     ^bb0(%out: i32):
       %idx = linalg.index 0 : index
@@ -695,7 +668,7 @@ func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor
   return %res : tensor<1x1x4xi32>
 }
 
-// CHECK-LABEL:   func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(
+// CHECK-LABEL:   func.func @scalar_read_with_broadcast_from_column_tensor
 // CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
 // CHECK:           %[[PAD:.*]] = arith.constant 0 : i32
 // CHECK:           %[[C0:.*]] = arith.constant 0 : index
@@ -705,3 +678,35 @@ func.func @vectorize_scalar_read_with_broadcast_from_column_tensor(%init: tensor
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
 // CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>
+
+// -----
+
+// TODO: Currently this fails to vectorise when the indices are non-constant.
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+    %src: tensor<3x3x3xf32>,
+    %init: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {
+
+  %c0 = arith.constant 0 : index
+
+  %res = linalg.generic {
+    indexing_maps = [#map],
+    iterator_types = ["parallel", "parallel", "parallel"]
+  } outs(%init : tensor<3x1x1xf32>) {
+  ^bb0(%out: f32):
+    %1 = tensor.extract %src[%c0, %c0, %c0] : tensor<3x3x3xf32>
+    linalg.yield %1 : f32
+  } -> tensor<3x1x1xf32>
+
+  return %res : tensor<3x1x1xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3x3xf32>,
+// CHECK-SAME:      %[[INIT:.*]]: tensor<3x1x1xf32>)
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
+// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
+// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>

>From 1dcd71fb6af55f21798a21c564ed58861552b687 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Tue, 10 Dec 2024 18:25:28 +0000
Subject: [PATCH 6/6] fixup! [mlir][nfc] Update vectorize-tensor-extract.mlir
 (3/N)

Add DAG to FileCHeck prefixes
---
 .../Linalg/vectorize-tensor-extract.mlir      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 0ee3898af8baac..cfa5b5d569ca9d 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -572,10 +572,10 @@ func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
 // CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
 // CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
-// CHECK:           %[[C2:.*]] = arith.constant 2 : index
-// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
 // CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>
@@ -621,8 +621,8 @@ func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32
 // CHECK-LABEL:   func.func @extract_scalar_from_0d_into_1d(
 // CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
 // CHECK-SAME:      %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
 // CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>
@@ -670,10 +670,10 @@ func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32
 
 // CHECK-LABEL:   func.func @scalar_read_with_broadcast_from_column_tensor
 // CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
-// CHECK:           %[[PAD:.*]] = arith.constant 0 : i32
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
-// CHECK:           %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
+// CHECK-DAG:       %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
 // CHECK:           %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
 // CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
 // CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>