[Mlir-commits] [mlir] [mlir][linalg] Restrict scalable vectorisation (PR #98639)

Fri Jul 12 07:30:54 PDT 2024

https://github.com/banach-space created https://github.com/llvm/llvm-project/pull/98639

Updates `vectorizeScalableVectorPrecondition` so that scalable
vectorisation is only applied in well understood and tested scenarios.

It's unlikely that we would ever want an arbitrary dimension to be
scalable. While the Linalg vectoriser should be flexible enough to
handle all possibilities:
  * in more "exotic" cases we are likely to struggle with lowerings
    further down the compilation stack,
  * it would be impractical given the limitations of LLVM (which usually
    reflect the limitations of actual hardware) - e.g. no support for
    "scalable" arrays of scalable or fixed width vectors (*).

Ultimately, the goal of this patch is to better document what's
currently supported. While this PR adds some new restrictions, no
existing tests are affected.

(*) At MLIR vector level that would correspond to e.g.
`vector<[[4]x8xf32>`.


>From 1130387989d6e93cb2e4dfd3b6acf2fce3a0cef8 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 12 Jul 2024 14:21:23 +0000
Subject: [PATCH] [mlir][linalg] Restrict scalable vectorisation

Updates `vectorizeScalableVectorPrecondition` so that scalable
vectorisation is only applied in well understood and tested scenarios.

It's unlikely that we would ever want an arbitrary dimension to be
scalable. While the Linalg vectoriser should be flexible enough to
handle all possibilities:
  * in more "exotic" cases we are likely to struggle with lowerings
    further down the compilation stack,
  * it would be impractical given the limitations of LLVM (which usually
    reflect the limitations of actual hardware) - e.g. no support for
    "scalable" arrays of scalable or fixed width vectors (*).

Ultimately, the goal of this patch is to better document what's
currently supported. While this PR adds some new restrictions, no
existing tests are affected.

(*) At MLIR vector level that would correspond to e.g.
`vector<[[4]x8xf32>`.
---
 .../Linalg/Transforms/Vectorization.cpp       | 73 ++++++++++++++++---
 .../Linalg/vectorization-unsupported.mlir     | 67 ++++++++++++++++-
 2 files changed, 130 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index a4c0508d0d8fa..9741120946362 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1936,7 +1936,8 @@ vectorizePadOpPrecondition(tensor::PadOp padOp,
   return success();
 }
 
-/// Preconditions for scalable vectors.
+/// Preconditions for scalable vectors. This is quite restrictive - it models
+/// the fact that in practice we would only make selected dimensions scalable.
 static LogicalResult
 vectorizeScalableVectorPrecondition(Operation *op,
                                     ArrayRef<int64_t> inputVectorSizes,
@@ -1944,18 +1945,72 @@ vectorizeScalableVectorPrecondition(Operation *op,
   assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
          "Number of input vector sizes and scalable dims doesn't match");
 
-  if (inputVectorSizes.empty())
-    return success();
+  size_t numOfScalableDims =
+      llvm::count_if(inputScalableVecDims, [](bool flag) { return flag; });
 
-  bool isScalable = inputScalableVecDims.back();
-  if (!isScalable)
+  if (numOfScalableDims == 0)
     return success();
 
-  // Only element-wise and 1d depthwise conv ops supported in the presence of
-  // scalable dims.
   auto linalgOp = dyn_cast<LinalgOp>(op);
-  return success(linalgOp && (isElementwise(linalgOp) ||
-                              isa<linalg::DepthwiseConv1DNwcWcOp>(op)));
+
+  // Cond 1: There's been no need for scalable vectorisation of
+  // non-linalg Ops so far
+  if (!linalgOp)
+    return failure();
+
+  // Cond 2: There's been no need for more than 2 scalable dims so far
+  if (numOfScalableDims > 2)
+    return failure();
+
+  // Cond 3: Look at the configuration in `inputScalableVecDims` and verify that
+  // it matches one of the supported cases:
+  //  1. exactly 1 dim is scalable and that's the _last_ parallel dim
+  //  2. exactly 2 dims are scalable and those are the _last two adjacent_
+  //     parallel dims
+  // The 2nd restriction above means that only Matmul-like Ops are supported
+  // when 2 dims are scalable, e.g. :
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+
+  // Find the first scalable flag
+  bool seenParalell = false;
+  auto iterators = linalgOp.getIteratorTypesArray();
+  SmallVector<bool> scalableFlags(inputScalableVecDims);
+  if (!scalableFlags.back()) {
+    while (!scalableFlags.back()) {
+      seenParalell |= (iterators.back() == utils::IteratorType::parallel);
+
+      iterators.pop_back();
+      scalableFlags.pop_back();
+    }
+  }
+
+  // TODO: Support scalable vectorisation for reduction dims
+  if (iterators.back() == utils::IteratorType::reduction)
+    return failure();
+
+  // If this is not the _last_ parallel dim, 1. above is not met
+  if (seenParalell)
+    return failure();
+
+  // If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
+  // supported for which expect the folowing config:
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+  if (numOfScalableDims == 2) {
+    scalableFlags.pop_back();
+    iterators.pop_back();
+
+    if (!scalableFlags.back() ||
+        (iterators.back() != utils::IteratorType::parallel))
+      return failure();
+  }
+
+  // Cond 4: Only element-wise and 1d depthwise conv ops supported in the
+  // presence of scalable vectors
+  return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
+                 isa<linalg::MatmulTransposeAOp>(op) ||
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index 5d3c07c8e23c1..c7ec39b0dbfb3 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -110,7 +110,7 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-  // -----
+// -----
 
 func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor<?xf32>, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> {
   %pad = arith.constant 0.000000e+00 : f32
@@ -126,3 +126,68 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
+                                  %acc: tensor<f32>) -> tensor<f32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+  (%in: f32, %init: f32) {
+    %0 = arith.addf %in, %init : f32
+    linalg.yield %0 : f32
+  }
+  return %0 : tensor<f32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
+                                                 %acc: tensor<?xf32>) -> tensor<?xf32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%input : tensor<?x?xf32>)
+    outs(%acc : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
+            outs(%C: memref<?x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul vector_sizes [[8], 16, 4] : !transform.any_op
+    transform.yield
+  }
+}