[Mlir-commits] [mlir] [MLIR][Linalg] Scalable Vectorization of Reduction (PR #97788)

Zhaoshi Zheng llvmlistbot at llvm.org
Fri Jul 19 20:43:01 PDT 2024


https://github.com/zhaoshiz updated https://github.com/llvm/llvm-project/pull/97788

From 086187351aed5236501d0a23af80fd03b64f49bd Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Fri, 19 Jul 2024 20:33:11 -0700
Subject: [PATCH 1/2] [MLIR][Linalg] Scalable Vectorization of Reduction on the
 Trailing Dimension

Allow scalable vectorization of linalg.reduce and linalg.generic ops
with a reduction iterator. For now, only reductions on the trailing
dimension are supported.
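
For example, a 1-D reduction like the following (a minimal sketch,
mirroring the tests added below; names are illustrative):

  %0 = linalg.reduce ins(%src : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %sum = arith.addf %in, %init : f32
      linalg.yield %sum : f32
    }

can now be vectorized with a scalable size on its (trailing) reduction
dimension, given a transform handle %reduce to the op:

  transform.structured.vectorize %reduce vector_sizes [[4]] : !transform.any_op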
---
 .../Linalg/Transforms/Vectorization.cpp       | 25 ++++--
 .../Linalg/vectorization-scalable.mlir        | 82 +++++++++++++++++++
 .../Linalg/vectorization-unsupported.mlir     | 20 ++---
 3 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7f7168eb86832..b2324d8aaf305 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,12 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
       llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
 }
 
+static bool hasLinalgReduction(LinalgOp &op) {
+  return isa<linalg::ReduceOp>(op) ||
+         (isa<linalg::GenericOp>(op) &&
+          llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
+}
+
 /// Build a vector.transfer_write of `value` into `outputOperand` at indices set
 /// to all `0`, where `outputOperand` is an output operand of the LinalgOp
 /// currently being vectorized. If `dest` has null rank, build a memref.store.
@@ -1787,6 +1793,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
   if (isa<ConvolutionOpInterface>(op.getOperation()))
     return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
 
+  if (hasLinalgReduction(op))
+    return reductionPreconditions(op);
+
   // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
   // linalg.copy ops and ops that implement ContractionOpInterface for now.
   if (!isElementwise(op) &&
@@ -1976,6 +1985,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
   //  1. exactly 1 dim is scalable and that's the _last_ parallel dim
   //  2. exactly 2 dims are scalable and those are the _last two adjacent_
   //     parallel dims
+  //  3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
   // The 2nd restriction above means that only Matmul-like Ops are supported
   // when 2 dims are scalable, e.g. :
   //    * iterators = [parallel, parallel, reduction]
@@ -1992,11 +2002,15 @@ vectorizeScalableVectorPrecondition(Operation *op,
     scalableFlags.pop_back();
   }
 
-  // TODO: Support scalable vectorisation for reduction dims
-  if (iterators.back() == utils::IteratorType::reduction)
-    return failure();
+  if (iterators.back() == utils::IteratorType::reduction) {
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+  }
 
-  // If this is not the _last_ parallel dim, 1. above is not met
+  // If this is not the _last_ parallel dim, 1. or 3. above is not met
   if (seenParalell)
     return failure();
 
@@ -2017,7 +2031,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
   // presence of scalable vectors
   return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
                  isa<linalg::MatmulTransposeAOp>(op) ||
-                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                 hasLinalgReduction(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index 4423ee6ea6a51..c29d8816d5f81 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,85 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
+                                          %arg1: tensor<f32>) -> tensor<f32> {
+
+  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
+  (%in: f32, %init: f32) {
+    %0 = arith.addf %in, %init : f32
+    linalg.yield %0 : f32
+  }
+  return %0 : tensor<f32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_reduction_scalable_1d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK:          %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:          %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
+// CHECK:          %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:          %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
+// CHECK:          %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK:          %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
+// CHECK:          %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
+// CHECK:          %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
+// CHECK:          %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>
+// CHECK:          return %[[VAL_11]] : tensor<f32>
+// CHECK:        }
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Note: this is the scalable version of the `vectorize_dynamic_reduction` test in test/Dialect/Linalg/vectorization.mlir.
+func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
+                                          %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%arg1 : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_reduction_scalable_2d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK:    %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?x?xf32>
+// CHECK:    %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:    %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor<?x?xf32>
+// CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1>
+// CHECK:    %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+// CHECK:    %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:    %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1>
+// CHECK:    %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor<?xf32>, vector<1xf32> } : vector<1xi1> -> vector<1xf32>
+// CHECK:    %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction <add>, %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32>
+// CHECK:    %[[VAL_12:.*]] = arith.constant 0 : index
+// CHECK:    %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor<?xf32> } : vector<1xi1> -> tensor<?xf32>
+// CHECK:    return %[[VAL_13]] : tensor<?xf32>
+// CHECK:  }
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index c7ec39b0dbfb3..164e7b23b1a1c 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
-                                  %acc: tensor<f32>) -> tensor<f32> {
+func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
+                                              %acc: tensor<?xf32>) -> tensor<?xf32> {
 
   // expected-error @+1 {{Attempted to vectorize, but failed}}
-  %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+  %0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
   (%in: f32, %init: f32) {
     %0 = arith.addf %in, %init : f32
     linalg.yield %0 : f32
   }
-  return %0 : tensor<f32>
+  return %0 : tensor<?xf32>
 }
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
     transform.yield
   }
 }
 
 // -----
 
-func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
-                                                 %acc: tensor<?xf32>) -> tensor<?xf32> {
+func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,
+                                                         %acc: tensor<?xf32>) -> tensor<?xf32> {
 
   // expected-error @+1 {{Attempted to vectorize, but failed}}
   %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                                         affine_map<(d0, d1) -> (d0)>],
-                        iterator_types = ["parallel", "reduction"] }
+                                         affine_map<(d0, d1) -> (d1)>],
+                        iterator_types = ["reduction", "parallel"] }
     ins(%input : tensor<?x?xf32>)
     outs(%acc : tensor<?xf32>) {
     ^bb(%in: f32, %out: f32) :
@@ -170,7 +170,7 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
     transform.yield
   }
 }

From fba222e9377302c8263a847ba30268c334d2c5bf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Fri, 19 Jul 2024 20:40:15 -0700
Subject: [PATCH 2/2] [MLIR][Linalg] Add integration tests of scalable
 vectorization of reduction

Note: I don't have a setup to run these tests natively (arm64-linux with
SVE). I am able to run them using QEMU on an x86_64-linux host with the
CMake variables below when building LLVM:

  -DARM_EMULATOR_EXECUTABLE="<path_to_qemu_bin>/qemu-aarch64" \
  -DARM_EMULATOR_OPTIONS="-L /usr/aarch64-linux-gnu" \
  -DARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE="<path_to_llvm_arm64_build>/bin/mlir-cpu-runner-arm64" \
  -DARM_EMULATOR_UTILS_LIB_DIR="<path_to_llvm_arm64_build>/lib"
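
Once configured, an individual test can then be driven through lit as
usual, e.g. (build path illustrative):

  <path_to_llvm_build>/bin/llvm-lit -v \
      mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir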
---
 .../Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir  | 134 +++++++++++++++++
 .../Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir  | 136 ++++++++++++++++++
 2 files changed, 270 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
new file mode 100644
index 0000000000000..4bcb2ef79da83
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -0,0 +1,134 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_1d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \
+// DEFINE:    -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+
+// REDEFINE: %{entry_point} = generic_reduce_1d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+
+func.func @reduce_1d_f32() {
+  // 1-D Tensor
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+  %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+  // Initialise the tensors
+  %pi = arith.constant  3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+  // Reduce
+  %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+
+  // Print and verify the output
+  // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+  // REDUCE-NEXT: [3141.6]
+
+  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @generic_reduce_1d_f32() {
+  // 1-D Tensor
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+  %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+  // Initialise the tensors
+  %pi = arith.constant  3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+  // Reduce
+  %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
+                                             affine_map<(d0) -> ()>],
+                            iterator_types = ["reduction"] }
+    ins(%A_in : tensor<?xf32>)
+    outs(%C_in : tensor<f32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<f32>
+
+  // Print and verify the output
+  // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+  // GENERIC-NEXT: [3141.6]
+
+  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  // A sequence that will tile and vectorise a Reduce Op
+  transform.named_sequence @tile_and_vectorize_reduce(%func
+    : !transform.op<"func.func"> {transform.readonly}) {
+
+    // Step 0: Get a handle to the reduce Op
+    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+      : (!transform.op<"func.func">) -> !transform.any_op
+
+    // Step 1: Tile
+    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize
+    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op
+
+    // Step 3: Lower vector.multi_reduction
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_masked_transfers
+      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+    } : !transform.op<"func.func">
+
+    transform.yield
+  }
+
+  // A sequence that goes over all functions in this module and applies
+  // "tile_and_vectorize_reduce"
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %funcs = transform.structured.match ops{["func.func"]} in %module
+        : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.foreach %funcs : !transform.op<"func.func"> {
+      ^bb2(%func : !transform.op<"func.func">):
+        transform.include @tile_and_vectorize_reduce failures(propagate)
+        (%func) : (!transform.op<"func.func">) -> ()
+    }
+    transform.yield
+  }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
new file mode 100644
index 0000000000000..63d0ac5126e66
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
@@ -0,0 +1,136 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_2d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \
+// DEFINE:    -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+
+// REDEFINE: %{entry_point} = generic_reduce_2d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+
+func.func @reduce_2d_f32() {
+  // 2-D Tensor
+  %M = arith.constant 16 : index
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+  // Initialise the tensors
+  %pi = arith.constant  3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+  // Reduce
+  %C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+
+  // Print and verify the output
+  // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+  // REDUCE-NEXT: [3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6]
+
+  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+func.func @generic_reduce_2d_f32() {
+  // 2-D Tensor
+  %M = arith.constant 16 : index
+  %N = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  // Allocate the input and output tensors
+  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+  // Initialise the tensors
+  %pi = arith.constant  3.1416 : f32
+  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+  // Reduce
+  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                             affine_map<(d0, d1) -> (d0)>],
+                            iterator_types = ["parallel", "reduction"] }
+    ins(%A_in : tensor<?x?xf32>)
+    outs(%C_in : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+
+  // Print and verify the output
+  // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT\n"
+
+  // GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+  // GENERIC-NEXT: [3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6,  3141.6]
+
+  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT\n"
+
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  // A sequence that will tile and vectorise a Reduce Op
+  transform.named_sequence @tile_and_vectorize_reduce(%func
+    : !transform.op<"func.func"> {transform.readonly}) {
+
+    // Step 0: Get a handle to the reduce Op
+    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+      : (!transform.op<"func.func">) -> !transform.any_op
+
+    // Step 1: Tile
+    %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize
+    transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op
+
+    // Step 3: Lower vector.multi_reduction
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_masked_transfers
+      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+    } : !transform.op<"func.func">
+
+    transform.yield
+  }
+
+  // A sequence that goes over all functions in this module and applies
+  // "tile_and_vectorize_reduce"
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %funcs = transform.structured.match ops{["func.func"]} in %module
+        : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.foreach %funcs : !transform.op<"func.func"> {
+      ^bb2(%func : !transform.op<"func.func">):
+        transform.include @tile_and_vectorize_reduce failures(propagate)
+        (%func) : (!transform.op<"func.func">) -> ()
+    }
+    transform.yield
+  }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)


