[Mlir-commits] [mlir] [MLIR][Linalg] Scalable Vectorization of Reduction on the Trailing Dimension (PR #97788)
Zhaoshi Zheng
llvmlistbot at llvm.org
Mon Jul 22 16:37:19 PDT 2024
https://github.com/zhaoshiz updated https://github.com/llvm/llvm-project/pull/97788
>From 086187351aed5236501d0a23af80fd03b64f49bd Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Fri, 19 Jul 2024 20:33:11 -0700
Subject: [PATCH 1/7] [MLIR][Linalg] Scalable Vectorization of Reduction on the
Trailing Dimension
Allow scalable vectorization of linalg::reduce ops and of linalg::generic
ops that have a reduction iterator. For now, only a reduction on the
trailing (innermost) dimension is supported.
---
.../Linalg/Transforms/Vectorization.cpp | 25 ++++--
.../Linalg/vectorization-scalable.mlir | 82 +++++++++++++++++++
.../Linalg/vectorization-unsupported.mlir | 20 ++---
3 files changed, 112 insertions(+), 15 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7f7168eb86832..b2324d8aaf305 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,12 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}
+static bool hasLinalgReduction(LinalgOp &op) {
+ return isa<linalg::ReduceOp>(op) ||
+ (isa<linalg::GenericOp>(op) &&
+ llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
+}
+
/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build an memref.store.
@@ -1787,6 +1793,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
if (isa<ConvolutionOpInterface>(op.getOperation()))
return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
+ if (hasLinalgReduction(op))
+ return reductionPreconditions(op);
+
// TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
// linalg.copy ops and ops that implement ContractionOpInterface for now.
if (!isElementwise(op) &&
@@ -1976,6 +1985,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
// 1. exactly 1 dim is scalable and that's the _last_ parallel dim
// 2. exactly 2 dims are scalable and those are the _last two adjacent_
// parallel dims
+ // 3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
// The 2nd restriction above means that only Matmul-like Ops are supported
// when 2 dims are scalable, e.g. :
// * iterators = [parallel, parallel, reduction]
@@ -1992,11 +2002,15 @@ vectorizeScalableVectorPrecondition(Operation *op,
scalableFlags.pop_back();
}
- // TODO: Support scalable vectorisation for reduction dims
- if (iterators.back() == utils::IteratorType::reduction)
- return failure();
+ if (iterators.back() == utils::IteratorType::reduction) {
+ if (iterators.size() != inputVectorSizes.size()) {
+ LDBG("Non-trailing reduction dim requested for scalable "
+ "vectorization\n");
+ return failure();
+ }
+ }
- // If this is not the _last_ parallel dim, 1. above is not met
+ // If this is not the _last_ parallel dim, 1. or 3. above is not met
if (seenParalell)
return failure();
@@ -2017,7 +2031,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
// presence of scalable vectors
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
- isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+ isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+ hasLinalgReduction(linalgOp));
}
LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index 4423ee6ea6a51..c29d8816d5f81 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,85 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
+ %arg1: tensor<f32>) -> tensor<f32> {
+
+ %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %0 = arith.addf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ return %0 : tensor<f32>
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
+// CHECK: %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
+// CHECK: %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
+// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>
+// CHECK: return %[[VAL_11]] : tensor<f32>
+// CHECK: }
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
+func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
+ %arg1: tensor<?xf32>) -> tensor<?xf32> {
+ %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"] }
+ ins(%arg0 : tensor<?x?xf32>)
+ outs(%arg1 : tensor<?xf32>) {
+ ^bb(%in: f32, %out: f32) :
+ %0 = arith.addf %in, %out : f32
+ linalg.yield %0 : f32
+ } -> tensor<?xf32>
+ return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?x?xf32>
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor<?x?xf32>
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1>
+// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1>
+// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor<?xf32>, vector<1xf32> } : vector<1xi1> -> vector<1xf32>
+// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction <add>, %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32>
+// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor<?xf32> } : vector<1xi1> -> tensor<?xf32>
+// CHECK: return %[[VAL_13]] : tensor<?xf32>
+// CHECK: }
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index c7ec39b0dbfb3..164e7b23b1a1c 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} {
// -----
-func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
- %acc: tensor<f32>) -> tensor<f32> {
+func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
+ %acc: tensor<?xf32>) -> tensor<?xf32> {
// expected-error @+1 {{Attempted to vectorize, but failed}}
- %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+ %0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
- return %0 : tensor<f32>
+ return %0 : tensor<?xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}
// -----
-func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
- %acc: tensor<?xf32>) -> tensor<?xf32> {
+func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,
+ %acc: tensor<?xf32>) -> tensor<?xf32> {
// expected-error @+1 {{Attempted to vectorize, but failed}}
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0)>],
- iterator_types = ["parallel", "reduction"] }
+ affine_map<(d0, d1) -> (d1)>],
+ iterator_types = ["reduction", "parallel"] }
ins(%input : tensor<?x?xf32>)
outs(%acc : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
@@ -170,7 +170,7 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}
>From fba222e9377302c8263a847ba30268c334d2c5bf Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Fri, 19 Jul 2024 20:40:15 -0700
Subject: [PATCH 2/7] [MLIR][Linalg] Add integration tests of scalable
vectorization of reduction
Note: I don't have a setup to run these tests natively (arm64-linux with SVE).
I was able to run them using QEMU on an x86_64-linux host, with the CMake
variables below set when building LLVM:
-DARM_EMULATOR_EXECUTABLE="<path_to_qemu_bin>/qemu-aarch64" \
-DARM_EMULATOR_OPTIONS="-L /usr/aarch64-linux-gnu" \
-DARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE="<path_to_llvm_arm64_build>/bin/mlir-cpu-runner-arm64" \
-DARM_EMULATOR_UTILS_LIB_DIR="<path_to_llvm_arm64_build>/lib"
---
.../Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir | 134 +++++++++++++++++
.../Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir | 136 ++++++++++++++++++
2 files changed, 270 insertions(+)
create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
new file mode 100644
index 0000000000000..4bcb2ef79da83
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -0,0 +1,134 @@
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_1d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
+// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+
+// REDEFINE: %{entry_point} = generic_reduce_1d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+
+func.func @reduce_1d_f32() {
+ // 1-D Tensor
+ %N = arith.constant 1000 : index
+ %c0_f32 = arith.constant 0.0 : f32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+ %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3.1416 : f32
+ %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+ %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+ // Reduce
+ %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %0 = arith.addf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+
+ // Print and verify the output
+ // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+ // REDUCE-NEXT: [3141.6]
+
+ %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+ call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+ // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+func.func @generic_reduce_1d_f32() {
+ // 1-D Tensor
+ %N = arith.constant 1000 : index
+ %c0_f32 = arith.constant 0.0 : f32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
+ %C_alloc = bufferization.alloc_tensor() : tensor<f32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3.1416 : f32
+ %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
+ %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
+
+ // Reduce
+ %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
+ affine_map<(d0) -> ()>],
+ iterator_types = ["reduction"] }
+ ins(%A_in : tensor<?xf32>)
+ outs(%C_in : tensor<f32>) {
+ ^bb(%in: f32, %out: f32) :
+ %0 = arith.addf %in, %out : f32
+ linalg.yield %0 : f32
+ } -> tensor<f32>
+
+ // Print and verify the output
+ // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+ // GENERIC-NEXT: [3141.6]
+
+ %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
+ call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+ // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ // A sequence that will tile and vectorise a Reduce Op
+ transform.named_sequence @tile_and_vectorize_reduce(%func
+ : !transform.op<"func.func"> {transform.readonly}) {
+
+ // Step 0: Get a handle to the reduce Op
+ %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+ : (!transform.op<"func.func">) -> !transform.any_op
+
+ // Step 1: Tile
+ %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Step 2: Vectorize
+ transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op
+
+ // Step 3: Lower vector.multi_reduction
+ transform.apply_patterns to %func {
+ transform.apply_patterns.vector.lower_masked_transfers
+ transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+ } : !transform.op<"func.func">
+
+ transform.yield
+ }
+
+ // A sequence that goes over all functions in this module and applies
+ // "tile_and_vectorize_reduce"
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+ %funcs = transform.structured.match ops{["func.func"]} in %module
+ : (!transform.any_op) -> !transform.op<"func.func">
+
+ transform.foreach %funcs : !transform.op<"func.func"> {
+ ^bb2(%func : !transform.op<"func.func">):
+ transform.include @tile_and_vectorize_reduce failures(propagate)
+ (%func) : (!transform.op<"func.func">) -> ()
+ }
+ transform.yield
+ }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
new file mode 100644
index 0000000000000..63d0ac5126e66
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
@@ -0,0 +1,136 @@
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = reduce_2d_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
+// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+
+// REDEFINE: %{entry_point} = generic_reduce_2d_f32
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+
+func.func @reduce_2d_f32() {
+ // 2-D Tensor
+ %M = arith.constant 16 : index
+ %N = arith.constant 1000 : index
+ %c0_f32 = arith.constant 0.0 : f32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+ %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3.1416 : f32
+ %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+ // Reduce
+ %C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %0 = arith.addf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+
+ // Print and verify the output
+ // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+ // REDUCE-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]
+
+ %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+ call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+ // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+func.func @generic_reduce_2d_f32() {
+ // 2-D Tensor
+ %M = arith.constant 16 : index
+ %N = arith.constant 1000 : index
+ %c0_f32 = arith.constant 0.0 : f32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
+ %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3.1416 : f32
+ %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
+
+ // Reduce
+ %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"] }
+ ins(%A_in : tensor<?x?xf32>)
+ outs(%C_in : tensor<?xf32>) {
+ ^bb(%in: f32, %out: f32) :
+ %0 = arith.addf %in, %out : f32
+ linalg.yield %0 : f32
+ } -> tensor<?xf32>
+
+ // Print and verify the output
+ // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+ // GENERIC-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]
+
+ %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
+ call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+ // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ // A sequence that will tile and vectorise a Reduce Op
+ transform.named_sequence @tile_and_vectorize_reduce(%func
+ : !transform.op<"func.func"> {transform.readonly}) {
+
+ // Step 0: Get a handle to the reduce Op
+ %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
+ : (!transform.op<"func.func">) -> !transform.any_op
+
+ // Step 1: Tile
+ %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Step 2: Vectorize
+ transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op
+
+ // Step 3: Lower vector.multi_reduction
+ transform.apply_patterns to %func {
+ transform.apply_patterns.vector.lower_masked_transfers
+ transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
+ } : !transform.op<"func.func">
+
+ transform.yield
+ }
+
+ // A sequence that goes over all functions in this module and applies
+ // "tile_and_vectorize_reduce"
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+ %funcs = transform.structured.match ops{["func.func"]} in %module
+ : (!transform.any_op) -> !transform.op<"func.func">
+
+ transform.foreach %funcs : !transform.op<"func.func"> {
+ ^bb2(%func : !transform.op<"func.func">):
+ transform.include @tile_and_vectorize_reduce failures(propagate)
+ (%func) : (!transform.op<"func.func">) -> ()
+ }
+ transform.yield
+ }
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
>From 1e5ef34b12ce814d3b136cb38b5ccd98a0dd4a78 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Sat, 20 Jul 2024 14:21:30 -0700
Subject: [PATCH 3/7] fix per clang-format
---
mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index b2324d8aaf305..7e3048b15fb9a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2004,9 +2004,9 @@ vectorizeScalableVectorPrecondition(Operation *op,
if (iterators.back() == utils::IteratorType::reduction) {
if (iterators.size() != inputVectorSizes.size()) {
- LDBG("Non-trailing reduction dim requested for scalable "
- "vectorization\n");
- return failure();
+ LDBG("Non-trailing reduction dim requested for scalable "
+ "vectorization\n");
+ return failure();
}
}
>From 75f0da224a077ddc8614e5a3ad085444e6f98935 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Sun, 21 Jul 2024 17:45:16 -0700
Subject: [PATCH 4/7] Addressed review comments
---
.../Linalg/Transforms/Vectorization.cpp | 35 ++++++----
.../Linalg/vectorization-scalable.mlir | 60 ++++++++---------
.../Linalg/vectorization-unsupported.mlir | 2 +-
.../Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir | 65 +++++++++++++++----
.../Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir | 48 +++++++++++++-
5 files changed, 151 insertions(+), 59 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7e3048b15fb9a..d17fae307e817 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,7 +586,9 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}
-static bool hasLinalgReduction(LinalgOp &op) {
+/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one
+/// reduction iterator.
+static bool hasReductionIterator(LinalgOp &op) {
return isa<linalg::ReduceOp>(op) ||
(isa<linalg::GenericOp>(op) &&
llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
@@ -1793,7 +1795,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
if (isa<ConvolutionOpInterface>(op.getOperation()))
return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
- if (hasLinalgReduction(op))
+ if (hasReductionIterator(op))
return reductionPreconditions(op);
// TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
@@ -2002,18 +2004,27 @@ vectorizeScalableVectorPrecondition(Operation *op,
scalableFlags.pop_back();
}
- if (iterators.back() == utils::IteratorType::reduction) {
- if (iterators.size() != inputVectorSizes.size()) {
- LDBG("Non-trailing reduction dim requested for scalable "
- "vectorization\n");
- return failure();
+ switch (iterators.back()) {
+ case utils::IteratorType::reduction: {
+ // Check 3. above is met.
+ if (iterators.size() != inputVectorSizes.size()) {
+ LDBG("Non-trailing reduction dim requested for scalable "
+ "vectorization\n");
+ return failure();
+ }
+ break;
+ }
+ case utils::IteratorType::parallel: {
+ // Check 1. and 2. above are met.
+ if (seenParalell) {
+ LDBG("Inner parallel dim not requested for scalable "
+ "vectorization\n");
+ return failure();
+ }
+ break;
}
}
- // If this is not the _last_ parallel dim, 1. or 3. above is not met
- if (seenParalell)
- return failure();
-
// If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
// supported for which expect the folowing config:
// * iterators = [parallel, parallel, reduction]
@@ -2032,7 +2043,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
- hasLinalgReduction(linalgOp));
+ hasReductionIterator(linalgOp));
}
LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index c29d8816d5f81..df2f8d434f36b 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -193,7 +193,7 @@ module attributes {transform.with_named_sequence} {
// -----
func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
- %arg1: tensor<f32>) -> tensor<f32> {
+ %arg1: tensor<f32>) -> tensor<f32> {
%0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
@@ -205,20 +205,18 @@ func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
-// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
-// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
-// CHECK: %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
-// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
-// CHECK: %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
-// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
-// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
-// CHECK: %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>
-// CHECK: return %[[VAL_11]] : tensor<f32>
-// CHECK: }
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
+// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
+// CHECK: %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector<f32>
+// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK: %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
+// CHECK: %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -247,27 +245,25 @@ func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
-// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?x?xf32>
-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor<?x?xf32>
-// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1>
-// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
-// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1>
-// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor<?xf32>, vector<1xf32> } : vector<1xi1> -> vector<1xf32>
-// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction <add>, %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32>
-// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor<?xf32> } : vector<1xi1> -> tensor<?xf32>
-// CHECK: return %[[VAL_13]] : tensor<?xf32>
-// CHECK: }
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
+// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
+// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
+// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
transform.yield
}
}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index 164e7b23b1a1c..6c4de1635028f 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -151,7 +151,7 @@ module attributes {transform.with_named_sequence} {
// -----
-func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,
+func.func @linalg_generic_reduction_scalable_leading_dim(%input: tensor<?x?xf32>,
%acc: tensor<?xf32>) -> tensor<?xf32> {
// expected-error @+1 {{Attempted to vectorize, but failed}}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
index 4bcb2ef79da83..7cdb35918c4c0 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -8,10 +8,13 @@
// RUN: %{compile}
-// RUN: %{run} | FileCheck %s --check-prefix=REDUCE
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-F32
+
+// REDEFINE: %{entry_point} = reduce_1d_i32
+// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-I32
// REDEFINE: %{entry_point} = generic_reduce_1d_f32
-// RUN: %{run} | FileCheck %s --check-prefix=GENERIC
+// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-F32
func.func @reduce_1d_f32() {
// 1-D Tensor
@@ -23,7 +26,7 @@ func.func @reduce_1d_f32() {
%C_alloc = bufferization.alloc_tensor() : tensor<f32>
// Initialise the tensors
- %pi = arith.constant 3.1416 : f32
+ %pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
@@ -35,16 +38,53 @@ func.func @reduce_1d_f32() {
}
// Print and verify the output
- // REDUCE-LABEL: SVE: START OF TEST OUTPUT
+ // REDUCE-F32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"
- // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
- // REDUCE-NEXT: [3141.6]
+ // REDUCE-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+ // REDUCE-F32-NEXT: [3141.6]
%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
- // REDUCE-NEXT: SVE: END OF TEST OUTPUT
+ // REDUCE-F32-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+func.func @reduce_1d_i32() {
+ // 1-D Tensor
+ %N = arith.constant 1000 : index
+ %c0_i32 = arith.constant 0 : i32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xi32>
+ %C_alloc = bufferization.alloc_tensor() : tensor<i32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3 : i32
+ %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?xi32>) -> tensor<?xi32>
+ %C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor<i32>
+
+ // Reduce
+ %C_out = linalg.reduce ins(%A_in : tensor<?xi32>) outs(%C_in: tensor<i32>) dimensions = [0]
+ (%in: i32, %init: i32) {
+ %0 = arith.addi %in, %init : i32
+ linalg.yield %0 : i32
+ }
+
+ // Print and verify the output
+ // REDUCE-I32-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // REDUCE-I32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+ // REDUCE-I32-NEXT: [3000]
+
+ %xf = tensor.cast %C_out : tensor<i32> to tensor<*xi32>
+ call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+ // REDUCE-I32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"
return
@@ -60,7 +100,7 @@ func.func @generic_reduce_1d_f32() {
%C_alloc = bufferization.alloc_tensor() : tensor<f32>
// Initialise the tensors
- %pi = arith.constant 3.1416 : f32
+ %pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>
@@ -76,16 +116,16 @@ func.func @generic_reduce_1d_f32() {
} -> tensor<f32>
// Print and verify the output
- // GENERIC-LABEL: SVE: START OF TEST OUTPUT
+ // GENERIC-F32-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"
- // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
- // GENERIC-NEXT: [3141.6]
+ // GENERIC-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
+ // GENERIC-F32-NEXT: [3141.6]
%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
- // GENERIC-NEXT: SVE: END OF TEST OUTPUT
+ // GENERIC-F32-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"
return
@@ -132,3 +172,4 @@ module attributes {transform.with_named_sequence} {
}
func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
index 63d0ac5126e66..bcfe12e374b4e 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir
@@ -24,7 +24,7 @@ func.func @reduce_2d_f32() {
%C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
// Initialise the tensors
- %pi = arith.constant 3.1416 : f32
+ %pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
%C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
@@ -62,7 +62,7 @@ func.func @generic_reduce_2d_f32() {
%C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>
// Initialise the tensors
- %pi = arith.constant 3.1416 : f32
+ %pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
%C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>
@@ -93,6 +93,49 @@ func.func @generic_reduce_2d_f32() {
return
}
+func.func @generic_reduce_2d_i32() {
+ // 2-D Tensor
+ %M = arith.constant 16 : index
+ %N = arith.constant 1000 : index
+ %c0_i32 = arith.constant 0 : i32
+
+ // Allocate the input and output tensors
+ %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
+ %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xi32>
+
+ // Initialise the tensors
+ %pi = arith.constant 3 : i32
+ %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
+ %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?xi32>) -> tensor<?xi32>
+
+ // Reduce
+ %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"] }
+ ins(%A_in : tensor<?x?xi32>)
+ outs(%C_in : tensor<?xi32>) {
+ ^bb(%in: i32, %out: i32) :
+ %0 = arith.addi %in, %out : i32
+ linalg.yield %0 : i32
+ } -> tensor<?xi32>
+
+ // Print and verify the output
+ // GENERIC-I32-LABEL: SVE: START OF TEST OUTPUT
+ vector.print str "SVE: START OF TEST OUTPUT\n"
+
+ // GENERIC-I32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+ // GENERIC-I32-NEXT: [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000]
+
+ %xf = tensor.cast %C_out : tensor<?xi32> to tensor<*xi32>
+ call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+ // GENERIC-I32-NEXT: SVE: END OF TEST OUTPUT
+ vector.print str "SVE: END OF TEST OUTPUT\n"
+
+ return
+}
+
+
module attributes {transform.with_named_sequence} {
// A sequence that will tile and vectorise a Reduce Op
transform.named_sequence @tile_and_vectorize_reduce(%func
@@ -134,3 +177,4 @@ module attributes {transform.with_named_sequence} {
}
func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
>From 00c683a9eca580f9b6c3726a00cdcf680a07717f Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Mon, 22 Jul 2024 15:57:10 -0700
Subject: [PATCH 5/7] Handle Matmul and Matvec's reduction dim on scalable
vectorization
In summary:
1. Do not allow scalable vectorization of the reduction dim of Matmul-like ops.
2. Allow scalable vectorization of only one dim of a Matvec op.
Allowed combinations of scalable flags and iterator types:
Matmul:
Iterators: ["parallel", "parallel", "reduction"]
Scalable Flags: ["true", "true", "false"]
["false", "true", "false"]
Matvec:
Iterators: ["parallel", "reduction"]
Scalable Flags: ["false", "true"]
["true", "false"]
---
.../Linalg/Transforms/Vectorization.cpp | 15 ++++
.../Linalg/vectorization-scalable.mlir | 89 ++++++++++++++++++-
.../Linalg/vectorization-unsupported.mlir | 64 ++++++++++++-
3 files changed, 166 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index d17fae307e817..e0af708cb6d70 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2012,6 +2012,12 @@ vectorizeScalableVectorPrecondition(Operation *op,
"vectorization\n");
return failure();
}
+ if (isa<linalg::MatmulOp>(op) ||
+ isa<linalg::MatmulTransposeAOp>(op)) {
+ LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
+ "is not supported\n");
+ return failure();
+ }
break;
}
case utils::IteratorType::parallel: {
@@ -2030,6 +2036,14 @@ vectorizeScalableVectorPrecondition(Operation *op,
// * iterators = [parallel, parallel, reduction]
// * scalable flags = [true, true, false]
if (numOfScalableDims == 2) {
+ // Disallow the case below, which breaks 3. above:
+ // * iterators = [..., parallel, reduction]
+ // * scalable flags = [..., true, true]
+ if (iterators.back() == utils::IteratorType::reduction) {
+ LDBG("Higher dim than the trailing reduction dim requested for scalable "
+ "vectorization\n");
+ return failure();
+ }
scalableFlags.pop_back();
iterators.pop_back();
@@ -2043,6 +2057,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+ isa<linalg::MatvecOp>(op) ||
hasReductionIterator(linalgOp));
}
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index df2f8d434f36b..4ee3088cc3778 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -230,7 +230,7 @@ module attributes {transform.with_named_sequence} {
// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
- %arg1: tensor<?xf32>) -> tensor<?xf32> {
+ %arg1: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
@@ -267,3 +267,90 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
+ %arg1: tensor<?xf32>,
+ %arg2: tensor<?xf32>) {
+ linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
+ outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
+ return
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
+// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
+// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
+// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
+// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0]], %[[VEC_RD_1]] : vector<4x[4]xf32>
+// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
+ %arg1: tensor<?xf32>,
+ %arg2: tensor<?xf32>) -> tensor<?xf32> {
+ %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"] }
+ ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
+ outs(%arg2 : tensor<?xf32>) {
+ ^bb(%mat: f32, %vec: f32, %res: f32) :
+ %0 = arith.mulf %mat, %vec : f32
+ %1 = arith.addf %res, %0 : f32
+ linalg.yield %1 : f32
+ } -> tensor<?xf32>
+ return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
+// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
+// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
+// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
+// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
+// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
+// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0]], %[[VEC_RD_1]] : vector<[4]x4xf32>
+// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
+// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
+// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index 6c4de1635028f..e9f8e08ca0c6b 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -177,10 +177,27 @@ module attributes {transform.with_named_sequence} {
// -----
+func.func @linalg_matvec_scalable_two_dims(%A: memref<?x?xf32>, %B: memref<?xf32>, %C: memref<?xf32>) {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ linalg.matvec ins(%A, %B: memref<?x?xf32>, memref<?xf32>)
+ outs(%C: memref<?xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %matmul = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %matmul vector_sizes [[4], [4]] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
// expected-error @+1 {{Attempted to vectorize, but failed}}
linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
- outs(%C: memref<?x?xf32>)
+ outs(%C: memref<?x?xf32>)
return
}
@@ -191,3 +208,48 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+func.func @linalg_matmul_scalable_trailing_reduction_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
+ outs(%C: memref<?x?xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %matmul vector_sizes [8, 16, [4]] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @linalg_generic_matmul_scalable_two_trailing_dims(%A: tensor<?x64xf32>, %B: tensor<64x?xf32>,
+ %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+
+ // expected-error @+1 {{Attempted to vectorize, but failed}}
+ %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
+ affine_map<(d0, d1, d2) -> (d2, d1)>,
+ affine_map<(d0, d1, d2) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel", "reduction"] }
+ ins(%A, %B : tensor<?x64xf32>, tensor<64x?xf32>)
+ outs(%C: tensor<?x?xf32>) {
+ ^bb(%in1: f32, %in2: f32, %out: f32) :
+ %0 = arith.mulf %in1, %in2 : f32
+ %1 = arith.addf %0, %out : f32
+ linalg.yield %1 : f32
+ } -> tensor<?x?xf32>
+ return %0 : tensor<?x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.vectorize %0 vector_sizes [2, [4], [4]] : !transform.any_op
+ transform.yield
+ }
+}
>From 5a4ac6d67542c2a259ed3c7c9dcf8bf3c34dfbb9 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Mon, 22 Jul 2024 16:19:45 -0700
Subject: [PATCH 6/7] update per clang-format
---
mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e0af708cb6d70..d983b52a6c2dc 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2057,8 +2057,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
- isa<linalg::MatvecOp>(op) ||
- hasReductionIterator(linalgOp));
+ isa<linalg::MatvecOp>(op) || hasReductionIterator(linalgOp));
}
LogicalResult mlir::linalg::vectorizeOpPrecondition(
>From 7c71012335f363662c76a135bd5c4d771be9b02f Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz at quicinc.com>
Date: Mon, 22 Jul 2024 16:36:46 -0700
Subject: [PATCH 7/7] 2nd update per clang-format
---
.../Linalg/Transforms/Vectorization.cpp | 44 +++++++++----------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index d983b52a6c2dc..165e5d1d0c59b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2005,30 +2005,30 @@ vectorizeScalableVectorPrecondition(Operation *op,
}
switch (iterators.back()) {
- case utils::IteratorType::reduction: {
- // Check 3. above is met.
- if (iterators.size() != inputVectorSizes.size()) {
- LDBG("Non-trailing reduction dim requested for scalable "
- "vectorization\n");
- return failure();
- }
- if (isa<linalg::MatmulOp>(op) ||
- isa<linalg::MatmulTransposeAOp>(op)) {
- LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
- "is not supported\n");
- return failure();
- }
- break;
+ case utils::IteratorType::reduction: {
+ // Check 3. above is met.
+ if (iterators.size() != inputVectorSizes.size()) {
+ LDBG("Non-trailing reduction dim requested for scalable "
+ "vectorization\n");
+ return failure();
}
- case utils::IteratorType::parallel: {
- // Check 1. and 2. above are met.
- if (seenParalell) {
- LDBG("Inner parallel dim not requested for scalable "
- "vectorization\n");
- return failure();
- }
- break;
+ if (isa<linalg::MatmulOp>(op) ||
+ isa<linalg::MatmulTransposeAOp>(op)) {
+ LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
+ "is not supported\n");
+ return failure();
+ }
+ break;
+ }
+ case utils::IteratorType::parallel: {
+ // Check 1. and 2. above are met.
+ if (seenParalell) {
+ LDBG("Inner parallel dim not requested for scalable "
+ "vectorization\n");
+ return failure();
}
+ break;
+ }
}
// If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
More information about the Mlir-commits
mailing list