[Mlir-commits] [mlir] [mlir][ArmSVE] Add `-arm-sve-legalize-vector-storage` pass (PR #68794)

Benjamin Maxwell llvmlistbot at llvm.org
Tue Oct 24 10:55:11 PDT 2023


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/68794

From 2ba4626eafcc45244b46579bd3e9f130f57c4fa8 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 10 Oct 2023 18:53:05 +0000
Subject: [PATCH 1/5] [mlir][ArmSVE] Add `-arm-sve-legalize-vector-storage`
 pass

This patch adds a pass that ensures that loads, stores, and allocations
of SVE vector types will be legal in the LLVM backend. It does this at
the memref level, so this pass must be applied before lowering all the
way to LLVM.
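
For example, it can slot in just before the LLVM conversions (this
mirrors the pipeline used by the integration test added below):

```
mlir-opt -convert-vector-to-scf -arm-sve-legalize-vector-storage \
  -convert-vector-to-llvm="enable-arm-sve" ...
```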

This pass currently fixes two issues.

It is only legal to load/store predicate types equal to (or greater
than) a full predicate register, which in MLIR is `vector<[16]xi1>`.
Smaller predicate types (`vector<[1|2|4|8]xi1>`) must be converted to a
full predicate type (referred to as an `svbool`) before storing, and
converted back after loading. This pass does this by widening
allocations and inserting conversion intrinsics.

For example:

```mlir
%alloca = memref.alloca() : memref<vector<[4]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
%reload = memref.load %alloca[] : memref<vector<[4]xi1>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
%svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
%reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
%reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
```

The storage for SVE vector types only needs to have an alignment that
matches the element type (for example, 4-byte alignment for `f32`s).
However, the LLVM backend currently defaults to aligning to `base size`
x `element size` bytes. For non-legal vector types like
`vector<[8]xf32>` this results in 8 x 4 = 32-byte alignment, but the
backend only supports up to 16-byte alignment for SVE vectors on the
stack. Explicitly setting a smaller alignment prevents this issue.
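
For example, applying this to an alloca of `vector<[8]xf32>` (a sketch;
the alignment shown follows the element-size rule above):

```mlir
%alloca = memref.alloca() : memref<vector<[8]xf32>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 4 : i64} : memref<vector<[8]xf32>>
```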
---
 .../mlir/Dialect/ArmSVE/CMakeLists.txt        |   1 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   5 +
 .../mlir/Dialect/ArmSVE/Transforms/Passes.h   |  33 ++
 .../mlir/Dialect/ArmSVE/Transforms/Passes.td  |  67 ++++
 mlir/include/mlir/InitAllPasses.h             |   2 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   2 +
 .../Transforms/LegalizeVectorStorage.cpp      | 310 ++++++++++++++++++
 .../ArmSVE/legalize-vector-storage.mlir       | 160 +++++++++
 .../ArmSVE/arrays-of-scalable-vectors.mlir    | 121 +++++++
 9 files changed, 701 insertions(+)
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
 create mode 100644 mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
 create mode 100644 mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir

diff --git a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
index f33061b2d87cffc..9f57627c321fb0c 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
new file mode 100644
index 000000000000000..7226642daf86172
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name ArmSVE)
+add_public_tablegen_target(MLIRArmSVEPassIncGen)
+
+add_mlir_doc(Passes ArmSVEPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
new file mode 100644
index 000000000000000..317fb9021b3c577
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::arm_sve {
+
+#define GEN_PASS_DECL
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+/// Pass to legalize the types of mask stores.
+std::unique_ptr<Pass> createLegalizeVectorStoragePass();
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+} // namespace mlir::arm_sve
+
+#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
new file mode 100644
index 000000000000000..35c49607181da0c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
@@ -0,0 +1,67 @@
+//===-- Passes.td - ArmSVE pass definition file ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+def LegalizeVectorStorage
+    : Pass<"arm-sve-legalize-vector-storage", "mlir::func::FuncOp"> {
+  let summary = "Ensures stores of SVE vector types will be legal";
+  let description = [{
+    This pass ensures that loads, stores, and allocations of SVE vector types
+    will be legal in the LLVM backend. It does this at the memref level, so this
+    pass must be applied before lowering all the way to LLVM.
+
+    This pass currently fixes two issues.
+
+    ## Loading and storing predicate types
+
+    It is only legal to load/store predicate types equal to (or greater than) a
+    full predicate register, which in MLIR is `vector<[16]xi1>`. Smaller
+    predicate types (`vector<[1|2|4|8]xi1>`) must be converted to a full
+    predicate type (referred to as an `svbool`) before storing, and converted
+    back after loading. This pass does this by widening allocations and
+    inserting conversion intrinsics.
+
+    For example:
+
+    ```mlir
+    %alloca = memref.alloca() : memref<vector<[4]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+    %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+    ```
+    Becomes:
+    ```mlir
+    %alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    %svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
+    memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
+    %reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
+    %reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
+    ```
+
+    ## Relax alignments for SVE vector allocas
+
+    The storage for SVE vector types only needs to have an alignment that
+    matches the element type (for example, 4-byte alignment for `f32`s). However,
+    the LLVM backend currently defaults to aligning to `base size` x
+    `element size` bytes. For non-legal vector types like `vector<[8]xf32>` this
+    results in 8 x 4 = 32-byte alignment, but the backend only supports up to
+    16-byte alignment for SVE vectors on the stack. Explicitly setting a smaller
+    alignment prevents this issue.
+  }];
+  let constructor = "mlir::arm_sve::createLegalizeVectorStoragePass()";
+  let dependentDialects = ["func::FuncDialect",
+    "memref::MemRefDialect", "vector::VectorDialect",
+    "arm_sve::ArmSVEDialect"];
+}
+
+#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 5489a13a8040bdb..7301905954f56d8 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h"
 #include "mlir/Dialect/Async/Passes.h"
 #include "mlir/Dialect/Bufferization/Pipelines/Passes.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
@@ -82,6 +83,7 @@ inline void registerAllPasses() {
   transform::registerTransformPasses();
   vector::registerVectorPasses();
   arm_sme::registerArmSMEPasses();
+  arm_sve::registerArmSVEPasses();
 
   // Dialect pipelines
   bufferization::registerBufferizationPipelines();
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
index 2f1c43fae240d51..a70c489a51fea9a 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -1,8 +1,10 @@
 add_mlir_dialect_library(MLIRArmSVETransforms
   LegalizeForLLVMExport.cpp
+  LegalizeVectorStorage.cpp
 
   DEPENDS
   MLIRArmSVEConversionsIncGen
+  MLIRArmSVEPassIncGen
 
   LINK_LIBS PUBLIC
   MLIRArmSVEDialect
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
new file mode 100644
index 000000000000000..610eb38089c4c88
--- /dev/null
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -0,0 +1,310 @@
+//===- LegalizeVectorStorage.cpp - Ensures SVE loads/stores are legal -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h"
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::arm_sve {
+#define GEN_PASS_DEF_LEGALIZEVECTORSTORAGE
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+} // namespace mlir::arm_sve
+
+using namespace mlir;
+using namespace mlir::arm_sve;
+
+constexpr StringLiteral kPassLabel("__arm_sve_legalize_vector_storage__");
+
+namespace {
+
+/// A (legal) SVE predicate mask that has a logical size, i.e. the number of
+/// bits matches the number of lanes it masks (such as vector<[4]xi1>), but
+/// is too small to be stored to memory.
+bool isLogicalSVEPredicateType(VectorType type) {
+  return type.getRank() > 0 && type.getElementType().isInteger(1) &&
+         type.getScalableDims().back() && type.getShape().back() < 16 &&
+         llvm::isPowerOf2_32(type.getShape().back()) &&
+         !llvm::is_contained(type.getScalableDims().drop_back(), true);
+}
+
+VectorType widenScalableMaskTypeToSvbool(VectorType type) {
+  assert(isLogicalSVEPredicateType(type));
+  return VectorType::Builder(type).setDim(type.getRank() - 1, 16);
+}
+
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithLegalizedOp(PatternRewriter &rewriter, TOp op,
+                              TLegalizerCallback callback) {
+  // Clone the previous op to preserve any properties/attributes.
+  auto newOp = op.clone();
+  rewriter.insert(newOp);
+  rewriter.replaceOp(op, callback(newOp));
+}
+
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithUnrealizedConversion(PatternRewriter &rewriter, TOp op,
+                                       TLegalizerCallback callback) {
+  replaceOpWithLegalizedOp(rewriter, op, [&](TOp newOp) {
+    // Mark our `unrealized_conversion_casts` with a pass label.
+    return rewriter.create<UnrealizedConversionCastOp>(
+        op.getLoc(), TypeRange{op.getResult().getType()},
+        ValueRange{callback(newOp)},
+        NamedAttribute(rewriter.getStringAttr(kPassLabel),
+                       rewriter.getUnitAttr()));
+  });
+}
+
+/// Extracts the legal memref value from the `unrealized_conversion_casts` added
+/// by this pass.
+static FailureOr<Value> getLegalMemRef(Value illegalMemref) {
+  Operation *definingOp = illegalMemref.getDefiningOp();
+  if (!definingOp || !definingOp->hasAttr(kPassLabel))
+    return failure();
+  auto unrealizedConversion =
+      llvm::cast<UnrealizedConversionCastOp>(definingOp);
+  return unrealizedConversion.getOperand(0);
+}
+
+/// The default alignment of an alloca may request overaligned sizes for SVE
+/// types, which will fail during stack frame allocation. This rewrite
+/// explicitly adds a reasonable alignment to allocas of scalable types.
+struct RelaxScalableVectorAllocaAlignment
+    : public OpRewritePattern<memref::AllocaOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::AllocaOp allocaOp,
+                                PatternRewriter &rewriter) const override {
+    auto elementType = allocaOp.getType().getElementType();
+    auto vectorType = llvm::dyn_cast<VectorType>(elementType);
+    if (!vectorType || !vectorType.isScalable() || allocaOp.getAlignment())
+      return failure();
+
+    unsigned elementByteSize =
+        vectorType.getElementType().getIntOrFloatBitWidth() / 8;
+
+    unsigned alignment = std::max(1u, elementByteSize);
+    allocaOp.setAlignment(alignment);
+
+    return success();
+  }
+};
+
+/// Replaces allocations of SVE predicates smaller than an svbool with a wider
+/// allocation and a tagged unrealized conversion.
+///
+/// Example
+/// ```
+/// %alloca = memref.alloca() : memref<vector<[4]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///   : memref<vector<[16]xi1>> to memref<vector<[4]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// ```
+template <typename AllocLikeOp>
+struct LegalizeAllocLikeOpConversion : public OpRewritePattern<AllocLikeOp> {
+  using OpRewritePattern<AllocLikeOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(AllocLikeOp allocLikeOp,
+                                PatternRewriter &rewriter) const override {
+    auto vectorType =
+        llvm::dyn_cast<VectorType>(allocLikeOp.getType().getElementType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    // Replace this alloc-like op of an SVE mask with one of a (storable)
+    // svbool_t mask. A temporary unrealized_conversion_cast is added to the old
+    // type to allow local rewrites.
+    replaceOpWithUnrealizedConversion(
+        rewriter, allocLikeOp, [&](AllocLikeOp newAllocLikeOp) {
+          newAllocLikeOp.getResult().setType(
+              llvm::cast<MemRefType>(newAllocLikeOp.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newAllocLikeOp;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces vector.type_casts of unrealized conversions to illegal memref types
+/// with legal type casts, followed by unrealized conversions.
+///
+/// Example:
+/// ```
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///   : memref<vector<3x[16]xi1>> to memref<vector<3x[8]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// %cast = vector.type_cast %alloca
+///   : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened_cast = vector.type_cast %widened
+///   : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+/// %cast = builtin.unrealized_conversion_cast %widened_cast
+///   : memref<3xvector<[16]xi1>> to memref<3xvector<[8]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// ```
+struct LegalizeVectorTypeCastConversion
+    : public OpRewritePattern<vector::TypeCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TypeCastOp typeCastOp,
+                                PatternRewriter &rewriter) const override {
+    auto resultType = typeCastOp.getResultMemRefType();
+    auto vectorType = llvm::dyn_cast<VectorType>(resultType.getElementType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(typeCastOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    // Replace this vector.type_cast with one of a (storable) svbool_t mask.
+    replaceOpWithUnrealizedConversion(
+        rewriter, typeCastOp, [&](vector::TypeCastOp newTypeCast) {
+          newTypeCast.setOperand(*legalMemref);
+          newTypeCast.getResult().setType(
+              llvm::cast<MemRefType>(newTypeCast.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newTypeCast;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces stores to unrealized conversions to illegal memref types with
+/// `arm_sve.convert_to_svbool`s followed by (legal) wider stores.
+///
+/// Example:
+/// ```
+/// memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = arm_sve.convert_to_svbool %mask : vector<[8]xi1>
+/// memref.store %svbool, %widened[] : memref<vector<[16]xi1>>
+/// ```
+struct LegalizeMemrefStoreConversion
+    : public OpRewritePattern<memref::StoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::StoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = storeOp.getLoc();
+
+    Value valueToStore = storeOp.getValueToStore();
+    auto vectorType = llvm::dyn_cast<VectorType>(valueToStore.getType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(storeOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(
+        llvm::cast<VectorType>(valueToStore.getType()));
+    auto convertToSvbool = rewriter.create<arm_sve::ConvertToSvboolOp>(
+        loc, legalMaskType, valueToStore);
+    // Replace this store with a conversion to a storable svbool_t mask,
+    // followed by a wider store.
+    replaceOpWithLegalizedOp(rewriter, storeOp,
+                             [&](memref::StoreOp newStoreOp) {
+                               newStoreOp.setOperand(0, convertToSvbool);
+                               newStoreOp.setOperand(1, *legalMemref);
+                               return newStoreOp;
+                             });
+
+    return success();
+  }
+};
+
+/// Replaces loads from unrealized conversions to illegal memref types with
+/// (legal) wider loads, followed by `arm_sve.convert_from_svbool`s.
+///
+/// Example:
+/// ```
+/// %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = memref.load %widened[] : memref<vector<[16]xi1>>
+/// %reload = arm_sve.convert_from_svbool %svbool : vector<[4]xi1>
+/// ```
+struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::LoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = loadOp.getLoc();
+
+    Value loadedMask = loadOp.getResult();
+    auto vectorType = llvm::dyn_cast<VectorType>(loadedMask.getType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(loadOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(vectorType);
+    // Replace this load with a legal load of an svbool_t type, followed by a
+    // conversion back to the original type.
+    replaceOpWithLegalizedOp(rewriter, loadOp, [&](memref::LoadOp newLoadOp) {
+      newLoadOp.setMemRef(*legalMemref);
+      newLoadOp.getResult().setType(legalMaskType);
+      return rewriter.create<arm_sve::ConvertFromSvboolOp>(
+          loc, loadedMask.getType(), newLoadOp);
+    });
+
+    return success();
+  }
+};
+
+struct LegalizeVectorStorage
+    : public arm_sve::impl::LegalizeVectorStorageBase<LegalizeVectorStorage> {
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<RelaxScalableVectorAllocaAlignment,
+                 LegalizeAllocLikeOpConversion<memref::AllocaOp>,
+                 LegalizeAllocLikeOpConversion<memref::AllocOp>,
+                 LegalizeVectorTypeCastConversion,
+                 LegalizeMemrefStoreConversion, LegalizeMemrefLoadConversion>(
+        patterns.getContext());
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
+                                            std::move(patterns)))) {
+      signalPassFailure();
+    }
+    ConversionTarget target(getContext());
+    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+        [](UnrealizedConversionCastOp unrealizedConversion) {
+          return !unrealizedConversion->hasAttr(kPassLabel);
+        });
+    // This detects if we failed to completely legalize the IR.
+    if (failed(applyPartialConversion(getOperation(), target, {})))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::arm_sve::createLegalizeVectorStoragePass() {
+  return std::make_unique<LegalizeVectorStorage>();
+}
diff --git a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
new file mode 100644
index 000000000000000..fda8e2e0fab9618
--- /dev/null
+++ b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
@@ -0,0 +1,160 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -arm-sve-legalize-vector-storage -split-input-file -verify-diagnostics | FileCheck %s
+
+/// This tests the basic functionality of the -arm-sve-legalize-vector-storage pass.
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv1i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[1]xi1>)
+func.func @store_and_reload_sve_predicate_nxv1i1(%mask: vector<[1]xi1>) -> vector<[1]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[1]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[1]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[1]xi1>
+  return %reload : vector<[1]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv2i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[2]xi1>)
+func.func @store_and_reload_sve_predicate_nxv2i1(%mask: vector<[2]xi1>) -> vector<[2]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[2]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[2]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[2]xi1>
+  return %reload : vector<[2]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv4i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[4]xi1>)
+func.func @store_and_reload_sve_predicate_nxv4i1(%mask: vector<[4]xi1>) -> vector<[4]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[4]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[4]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[4]xi1>
+  return %reload : vector<[4]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv8i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[8]xi1>)
+func.func @store_and_reload_sve_predicate_nxv8i1(%mask: vector<[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[8]xi1>
+  return %reload : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_2d_mask_and_reload_slice(
+// CHECK-SAME:                                  %[[MASK:.*]]: vector<3x[8]xi1>)
+func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+  %c0 = arith.constant 0 : index
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<3x[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<3x[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<3x[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[UNPACK:.*]] = vector.type_cast %[[ALLOCA]] : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+  %unpack = vector.type_cast %alloca : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[UNPACK]][%[[C0]]] : memref<3xvector<[16]xi1>>
+  // CHECK-NEXT: %[[SLICE:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %slice = memref.load %unpack[%c0] : memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: return %[[SLICE]] : vector<[8]xi1>
+  return %slice : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @set_sve_alloca_alignment
+func.func @set_sve_alloca_alignment() {
+  // CHECK-COUNT-6: alignment = 1
+  %a1 = memref.alloca() : memref<vector<[32]xi8>>
+  %a2 = memref.alloca() : memref<vector<[16]xi8>>
+  %a3 = memref.alloca() : memref<vector<[8]xi8>>
+  %a4 = memref.alloca() : memref<vector<[4]xi8>>
+  %a5 = memref.alloca() : memref<vector<[2]xi8>>
+  %a6 = memref.alloca() : memref<vector<[1]xi8>>
+
+  // CHECK-COUNT-6: alignment = 2
+  %b1 = memref.alloca() : memref<vector<[32]xi16>>
+  %b2 = memref.alloca() : memref<vector<[16]xi16>>
+  %b3 = memref.alloca() : memref<vector<[8]xi16>>
+  %b4 = memref.alloca() : memref<vector<[4]xi16>>
+  %b5 = memref.alloca() : memref<vector<[2]xi16>>
+  %b6 = memref.alloca() : memref<vector<[1]xi16>>
+
+  // CHECK-COUNT-6: alignment = 4
+  %c1 = memref.alloca() : memref<vector<[32]xi32>>
+  %c2 = memref.alloca() : memref<vector<[16]xi32>>
+  %c3 = memref.alloca() : memref<vector<[8]xi32>>
+  %c4 = memref.alloca() : memref<vector<[4]xi32>>
+  %c5 = memref.alloca() : memref<vector<[2]xi32>>
+  %c6 = memref.alloca() : memref<vector<[1]xi32>>
+
+  // CHECK-COUNT-6: alignment = 8
+  %d1 = memref.alloca() : memref<vector<[32]xi64>>
+  %d2 = memref.alloca() : memref<vector<[16]xi64>>
+  %d3 = memref.alloca() : memref<vector<[8]xi64>>
+  %d4 = memref.alloca() : memref<vector<[4]xi64>>
+  %d5 = memref.alloca() : memref<vector<[2]xi64>>
+  %d6 = memref.alloca() : memref<vector<[1]xi64>>
+
+  // CHECK-COUNT-6: alignment = 4
+  %e1 = memref.alloca() : memref<vector<[32]xf32>>
+  %e2 = memref.alloca() : memref<vector<[16]xf32>>
+  %e3 = memref.alloca() : memref<vector<[8]xf32>>
+  %e4 = memref.alloca() : memref<vector<[4]xf32>>
+  %e5 = memref.alloca() : memref<vector<[2]xf32>>
+  %e6 = memref.alloca() : memref<vector<[1]xf32>>
+
+  // CHECK-COUNT-6: alignment = 8
+  %f1 = memref.alloca() : memref<vector<[32]xf64>>
+  %f2 = memref.alloca() : memref<vector<[16]xf64>>
+  %f3 = memref.alloca() : memref<vector<[8]xf64>>
+  %f4 = memref.alloca() : memref<vector<[4]xf64>>
+  %f5 = memref.alloca() : memref<vector<[2]xf64>>
+  %f6 = memref.alloca() : memref<vector<[1]xf64>>
+
+  "prevent.dce"(
+    %a1, %a2, %a3, %a4, %a5, %a6,
+    %b1, %b2, %b3, %b4, %b5, %b6,
+    %c1, %c2, %c3, %c4, %c5, %c6,
+    %d1, %d2, %d3, %d4, %d5, %d6,
+    %e1, %e2, %e3, %e4, %e5, %e6,
+    %f1, %f2, %f3, %f4, %f5, %f6)
+    : (memref<vector<[32]xi8>>, memref<vector<[16]xi8>>, memref<vector<[8]xi8>>, memref<vector<[4]xi8>>, memref<vector<[2]xi8>>, memref<vector<[1]xi8>>,
+       memref<vector<[32]xi16>>, memref<vector<[16]xi16>>, memref<vector<[8]xi16>>, memref<vector<[4]xi16>>, memref<vector<[2]xi16>>, memref<vector<[1]xi16>>,
+       memref<vector<[32]xi32>>, memref<vector<[16]xi32>>, memref<vector<[8]xi32>>, memref<vector<[4]xi32>>, memref<vector<[2]xi32>>, memref<vector<[1]xi32>>,
+       memref<vector<[32]xi64>>, memref<vector<[16]xi64>>, memref<vector<[8]xi64>>, memref<vector<[4]xi64>>, memref<vector<[2]xi64>>, memref<vector<[1]xi64>>,
+       memref<vector<[32]xf32>>, memref<vector<[16]xf32>>, memref<vector<[8]xf32>>, memref<vector<[4]xf32>>, memref<vector<[2]xf32>>, memref<vector<[1]xf32>>,
+       memref<vector<[32]xf64>>, memref<vector<[16]xf64>>, memref<vector<[8]xf64>>, memref<vector<[4]xf64>>, memref<vector<[2]xf64>>, memref<vector<[1]xf64>>) -> ()
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
new file mode 100644
index 000000000000000..260cd1eabe7dae1
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -0,0 +1,121 @@
+// RUN: mlir-opt %s -convert-vector-to-scf -arm-sve-legalize-vector-storage -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+/// This tests basic functionality of arrays of scalable vectors, which in MLIR
+/// are vectors with a single trailing scalable dimension. This test requires
+/// the -arm-sve-legalize-vector-storage pass to ensure the loads/stores done
+/// here are legal for the LLVM backend.
+
+func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>)  {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %dim = memref.dim %memref, %c1 : memref<3x?xf32>
+  %mask = vector.create_mask %c2, %dim : vector<3x[8]xi1>
+  %vector = vector.transfer_read %memref[%c0,%c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[8]xf32>
+
+  /// TODO: Support vector.print for arrays of scalable vectors.
+  %row0 = vector.extract %vector[0] : vector<[8]xf32> from vector<3x[8]xf32>
+  %row1 = vector.extract %vector[1] : vector<[8]xf32> from vector<3x[8]xf32>
+  %row2 = vector.extract %vector[2] : vector<[8]xf32> from vector<3x[8]xf32>
+
+  /// Print each of the vectors. This only checks the first eight elements (which
+  /// works for all vscale >= 1).
+
+  // CHECK-LABEL: TEST 1
+  vector.print str "TEST 1 (print and read 2D arrays of scalable vectors)"
+  // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
+  vector.print %row0 : vector<[8]xf32>
+  // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
+  vector.print %row1 : vector<[8]xf32>
+  /// This last row is all zero due to our mask.
+  // CHECK: ( 0, 0, 0, 0, 0, 0, 0, 0
+  vector.print %row2 : vector<[8]xf32>
+
+  return
+}
+
+func.func @print_1x2xVSCALExf32(%vector: vector<1x2x[4]xf32>) {
+  /// TODO: Support vector.print for arrays of scalable vectors.
+  %slice0 = vector.extract %vector[0, 0] : vector<[4]xf32> from vector<1x2x[4]xf32>
+  %slice1 = vector.extract %vector[0, 1] : vector<[4]xf32> from vector<1x2x[4]xf32>
+  vector.print %slice0 : vector<[4]xf32>
+  vector.print %slice1 : vector<[4]xf32>
+  return
+}
+
+func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 2 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
+  %dim_b = memref.dim %b, %c2 : memref<1x2x?xf32>
+  %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1>
+  %mask_b = vector.create_mask %c2, %c3, %dim_b : vector<1x2x[4]xi1>
+
+  vector.print str "TEST 2 (reading and adding two 3D arrays of scalable vectors)"
+
+  /// Print each of the vectors. This only checks the first four elements (which
+  /// works for all vscale >= 1).
+
+  // CHECK-LABEL: Vector A
+  // CHECK-NEXT: ( 5, 5, 5, 5
+  // CHECK-NEXT: ( 5, 5, 5, 5
+  vector.print str "\nVector A"
+  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
+  func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> ()
+
+  vector.print str "\nVector B"
+  // CHECK-LABEL: Vector B
+  // CHECK-NEXT: ( 4, 4, 4, 4
+  // CHECK-NEXT: ( 4, 4, 4, 4
+  %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
+  func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> ()
+
+  %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
+  // CHECK-LABEL: Sum
+  // CHECK-NEXT: ( 9, 9, 9, 9
+  // CHECK-NEXT: ( 9, 9, 9, 9
+  vector.print str "\nSum"
+  func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> ()
+
+  return
+}
+
+func.func @entry() {
+  %vscale = vector.vscale
+
+  %c4 = arith.constant 4 : index
+  %c8 = arith.constant 8 : index
+  %f32_8 = arith.constant 8.0 : f32
+  %f32_5 = arith.constant 5.0 : f32
+  %f32_4 = arith.constant 4.0 : f32
+
+  vector.print str "\n====================\n"
+
+  %test_1_memref_size = arith.muli %vscale, %c8 : index
+  %test_1_memref = memref.alloca(%test_1_memref_size) : memref<3x?xf32>
+
+  linalg.fill ins(%f32_8 : f32) outs(%test_1_memref : memref<3x?xf32>)
+
+  func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> ()
+
+  vector.print str "\n====================\n"
+
+  %test_2_memref_size = arith.muli %vscale, %c4 : index
+  %test_2_memref_a = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32>
+  %test_2_memref_b = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32>
+
+  linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a : memref<1x2x?xf32>)
+  linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b : memref<1x2x?xf32>)
+
+  func.call @add_arrays_of_scalable_vectors(
+    %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> ()
+
+  vector.print str "\n====================\n"
+
+  return
+}

From 72f19279dcd0a95eb21dc7f72e4d5850569361f8 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 23 Oct 2023 12:59:03 +0000
Subject: [PATCH 2/5] Always align SVE vectors to 16 bytes and predicates to 2
 bytes
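
With this change the alloca pattern uses fixed defaults rather than the
element size. For illustration, it now produces allocas along these
lines (a sketch derived from the updated pattern and tests):

```mlir
// SVE data vectors get a 16-byte alignment:
%data = memref.alloca() {alignment = 16 : i64} : memref<vector<[8]xf32>>
// SVE predicates get a 2-byte alignment:
%pred = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
```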

---
 .../Transforms/LegalizeVectorStorage.cpp      | 10 ++++-----
 .../ArmSVE/legalize-vector-storage.mlir       | 22 +++++++++----------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
index 610eb38089c4c88..e7edcf6ec789235 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -82,15 +82,13 @@ struct RelaxScalableVectorAllocaAlignment
 
   LogicalResult matchAndRewrite(memref::AllocaOp allocaOp,
                                 PatternRewriter &rewriter) const override {
-    auto elementType = allocaOp.getType().getElementType();
-    auto vectorType = llvm::dyn_cast<VectorType>(elementType);
+    auto memrefElementType = allocaOp.getType().getElementType();
+    auto vectorType = llvm::dyn_cast<VectorType>(memrefElementType);
     if (!vectorType || !vectorType.isScalable() || allocaOp.getAlignment())
       return failure();
 
-    unsigned elementByteSize =
-        vectorType.getElementType().getIntOrFloatBitWidth() / 8;
-
-    unsigned alignment = std::max(1u, elementByteSize);
+    // Set alignment based on the defaults for SVE vectors and predicates.
+    unsigned alignment = vectorType.getElementType().isInteger(1) ? 2 : 16;
     allocaOp.setAlignment(alignment);
 
     return success();
diff --git a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
index fda8e2e0fab9618..61879c48712f4d2 100644
--- a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
+++ b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
@@ -7,7 +7,7 @@
 // CHECK-LABEL: @store_and_reload_sve_predicate_nxv1i1(
 // CHECK-SAME:                                         %[[MASK:.*]]: vector<[1]xi1>)
 func.func @store_and_reload_sve_predicate_nxv1i1(%mask: vector<[1]xi1>) -> vector<[1]xi1> {
-  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
   %alloca = memref.alloca() : memref<vector<[1]xi1>>
   // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[1]xi1>
   // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
@@ -24,7 +24,7 @@ func.func @store_and_reload_sve_predicate_nxv1i1(%mask: vector<[1]xi1>) -> vecto
 // CHECK-LABEL: @store_and_reload_sve_predicate_nxv2i1(
 // CHECK-SAME:                                         %[[MASK:.*]]: vector<[2]xi1>)
 func.func @store_and_reload_sve_predicate_nxv2i1(%mask: vector<[2]xi1>) -> vector<[2]xi1> {
-  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
   %alloca = memref.alloca() : memref<vector<[2]xi1>>
   // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[2]xi1>
   // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
@@ -41,7 +41,7 @@ func.func @store_and_reload_sve_predicate_nxv2i1(%mask: vector<[2]xi1>) -> vecto
 // CHECK-LABEL: @store_and_reload_sve_predicate_nxv4i1(
 // CHECK-SAME:                                         %[[MASK:.*]]: vector<[4]xi1>)
 func.func @store_and_reload_sve_predicate_nxv4i1(%mask: vector<[4]xi1>) -> vector<[4]xi1> {
-  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
   %alloca = memref.alloca() : memref<vector<[4]xi1>>
   // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[4]xi1>
   // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
@@ -58,7 +58,7 @@ func.func @store_and_reload_sve_predicate_nxv4i1(%mask: vector<[4]xi1>) -> vecto
 // CHECK-LABEL: @store_and_reload_sve_predicate_nxv8i1(
 // CHECK-SAME:                                         %[[MASK:.*]]: vector<[8]xi1>)
 func.func @store_and_reload_sve_predicate_nxv8i1(%mask: vector<[8]xi1>) -> vector<[8]xi1> {
-  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
   %alloca = memref.alloca() : memref<vector<[8]xi1>>
   // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[8]xi1>
   // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
@@ -77,7 +77,7 @@ func.func @store_and_reload_sve_predicate_nxv8i1(%mask: vector<[8]xi1>) -> vecto
 func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]xi1> {
   // CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
   %c0 = arith.constant 0 : index
-  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<3x[16]xi1>>
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<3x[16]xi1>>
   %alloca = memref.alloca() : memref<vector<3x[8]xi1>>
   // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<3x[8]xi1>
   // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<3x[16]xi1>>
@@ -95,7 +95,7 @@ func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]
 
 // CHECK-LABEL: @set_sve_alloca_alignment
 func.func @set_sve_alloca_alignment() {
-  // CHECK-COUNT-6: alignment = 1
+  // CHECK-COUNT-6: alignment = 16
   %a1 = memref.alloca() : memref<vector<[32]xi8>>
   %a2 = memref.alloca() : memref<vector<[16]xi8>>
   %a3 = memref.alloca() : memref<vector<[8]xi8>>
@@ -103,7 +103,7 @@ func.func @set_sve_alloca_alignment() {
   %a5 = memref.alloca() : memref<vector<[2]xi8>>
   %a6 = memref.alloca() : memref<vector<[1]xi8>>
 
-  // CHECK-COUNT-6: alignment = 2
+  // CHECK-COUNT-6: alignment = 16
   %b1 = memref.alloca() : memref<vector<[32]xi16>>
   %b2 = memref.alloca() : memref<vector<[16]xi16>>
   %b3 = memref.alloca() : memref<vector<[8]xi16>>
@@ -111,7 +111,7 @@ func.func @set_sve_alloca_alignment() {
   %b5 = memref.alloca() : memref<vector<[2]xi16>>
   %b6 = memref.alloca() : memref<vector<[1]xi16>>
 
-  // CHECK-COUNT-6: alignment = 4
+  // CHECK-COUNT-6: alignment = 16
   %c1 = memref.alloca() : memref<vector<[32]xi32>>
   %c2 = memref.alloca() : memref<vector<[16]xi32>>
   %c3 = memref.alloca() : memref<vector<[8]xi32>>
@@ -119,7 +119,7 @@ func.func @set_sve_alloca_alignment() {
   %c5 = memref.alloca() : memref<vector<[2]xi32>>
   %c6 = memref.alloca() : memref<vector<[1]xi32>>
 
-  // CHECK-COUNT-6: alignment = 8
+  // CHECK-COUNT-6: alignment = 16
   %d1 = memref.alloca() : memref<vector<[32]xi64>>
   %d2 = memref.alloca() : memref<vector<[16]xi64>>
   %d3 = memref.alloca() : memref<vector<[8]xi64>>
@@ -127,7 +127,7 @@ func.func @set_sve_alloca_alignment() {
   %d5 = memref.alloca() : memref<vector<[2]xi64>>
   %d6 = memref.alloca() : memref<vector<[1]xi64>>
 
-  // CHECK-COUNT-6: alignment = 4
+  // CHECK-COUNT-6: alignment = 16
   %e1 = memref.alloca() : memref<vector<[32]xf32>>
   %e2 = memref.alloca() : memref<vector<[16]xf32>>
   %e3 = memref.alloca() : memref<vector<[8]xf32>>
@@ -135,7 +135,7 @@ func.func @set_sve_alloca_alignment() {
   %e5 = memref.alloca() : memref<vector<[2]xf32>>
   %e6 = memref.alloca() : memref<vector<[1]xf32>>
 
-  // CHECK-COUNT-6: alignment = 8
+  // CHECK-COUNT-6: alignment = 16
   %f1 = memref.alloca() : memref<vector<[32]xf64>>
   %f2 = memref.alloca() : memref<vector<[16]xf64>>
   %f3 = memref.alloca() : memref<vector<[8]xf64>>

From 8ed3c15fc4b118d73676de0f4f1adc358f5d0702 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 24 Oct 2023 14:08:06 +0000
Subject: [PATCH 3/5] Add
 mlir::arm_sve::populateLegalizeVectorStoragePatterns() function

This will be needed in IREE.
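
For example, a downstream pipeline could apply the patterns directly
(a minimal sketch; the helper function here is hypothetical):

```cpp
#include "mlir/Dialect/ArmSVE/Transforms/Passes.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Hypothetical helper showing how a downstream pass might apply the
// storage legalization patterns without running the full pass.
static mlir::LogicalResult legalizeSVEStorage(mlir::Operation *op) {
  mlir::RewritePatternSet patterns(op->getContext());
  mlir::arm_sve::populateLegalizeVectorStoragePatterns(patterns);
  return mlir::applyPatternsAndFoldGreedily(op, std::move(patterns));
}
```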
---
 .../mlir/Dialect/ArmSVE/Transforms/Passes.h   |  5 ++++-
 .../Transforms/LegalizeVectorStorage.cpp      | 19 +++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
index 317fb9021b3c577..66f30a67cb05b8a 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
@@ -17,9 +17,12 @@ namespace mlir::arm_sve {
 #define GEN_PASS_DECL
 #include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
 
-/// Pass to legalize the types of mask stores.
+/// Pass to legalize Arm SVE vector storage.
 std::unique_ptr<Pass> createLegalizeVectorStoragePass();
 
+/// Collect a set of patterns to legalize Arm SVE vector storage.
+void populateLegalizeVectorStoragePatterns(RewritePatternSet &patterns);
+
 //===----------------------------------------------------------------------===//
 // Registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
index e7edcf6ec789235..840f27db7e64276 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -275,17 +275,24 @@ struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
   }
 };
 
+} // namespace
+
+void mlir::arm_sve::populateLegalizeVectorStoragePatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<RelaxScalableVectorAllocaAlignment,
+               LegalizeAllocLikeOpConversion<memref::AllocaOp>,
+               LegalizeAllocLikeOpConversion<memref::AllocOp>,
+               LegalizeVectorTypeCastConversion, LegalizeMemrefStoreConversion,
+               LegalizeMemrefLoadConversion>(patterns.getContext());
+}
+
+namespace {
 struct LegalizeVectorStorage
     : public arm_sve::impl::LegalizeVectorStorageBase<LegalizeVectorStorage> {
 
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    patterns.add<RelaxScalableVectorAllocaAlignment,
-                 LegalizeAllocLikeOpConversion<memref::AllocaOp>,
-                 LegalizeAllocLikeOpConversion<memref::AllocOp>,
-                 LegalizeVectorTypeCastConversion,
-                 LegalizeMemrefStoreConversion, LegalizeMemrefLoadConversion>(
-        patterns.getContext());
+    populateLegalizeVectorStoragePatterns(patterns);
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       signalPassFailure();

From 304aadfa0bea00c062632436b0b1e9aa5b60cec5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 24 Oct 2023 17:15:06 +0000
Subject: [PATCH 4/5] Some fixups

---
 .../mlir/Dialect/ArmSVE/Transforms/Passes.td  |  3 +-
 .../Transforms/LegalizeVectorStorage.cpp      | 67 +++++++++++--------
 .../ArmSVE/legalize-vector-storage.mlir       |  9 +++
 .../ArmSVE/arrays-of-scalable-vectors.mlir    | 24 +++----
 4 files changed, 59 insertions(+), 44 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
index 35c49607181da0c..19d16b214a5d9f0 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
@@ -28,7 +28,8 @@ def LegalizeVectorStorage
     predicate types (`vector<[1|2|4|8]xi1>`) must be converted to a full
     predicate type (referred to as an `svbool`) before storing, and converted
     back after loading. This pass does this by widening allocations and
-    inserting conversion intrinsics.
+    inserting conversion intrinsics. Note: non-power-of-two masks (e.g.
+    `vector<[7]xi1>`), which are not SVE predicates, are ignored.
 
     For example:
 
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
index 840f27db7e64276..9d34afd3458a3b3 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -21,14 +21,16 @@ namespace mlir::arm_sve {
 using namespace mlir;
 using namespace mlir::arm_sve;
 
-constexpr StringLiteral kPassLabel("__arm_sve_legalize_vector_storage__");
+// A tag to mark unrealized_conversion_casts produced by this pass. This is
+// used to detect IR this pass failed to completely legalize, and to report an
+// error.
+constexpr StringLiteral kSVELegalizerTag("__arm_sve_legalize_vector_storage__");
 
 namespace {
 
 /// A (legal) SVE predicate mask that has a logical size, i.e. the number of
 /// bits matches the number of lanes it masks (such as vector<[4]xi1>), but
 /// is too small to be stored to memory.
-bool isLogicalSVEPredicateType(VectorType type) {
+bool isLogicalSVEMaskType(VectorType type) {
   return type.getRank() > 0 && type.getElementType().isInteger(1) &&
          type.getScalableDims().back() && type.getShape().back() < 16 &&
          llvm::isPowerOf2_32(type.getShape().back()) &&
@@ -36,7 +38,7 @@ bool isLogicalSVEPredicateType(VectorType type) {
 }
 
 VectorType widenScalableMaskTypeToSvbool(VectorType type) {
-  assert(isLogicalSVEPredicateType(type));
+  assert(isLogicalSVEMaskType(type));
   return VectorType::Builder(type).setDim(type.getRank() - 1, 16);
 }
 
@@ -57,16 +59,16 @@ void replaceOpWithUnrealizedConversion(PatternRewriter &rewriter, TOp op,
     return rewriter.create<UnrealizedConversionCastOp>(
         op.getLoc(), TypeRange{op.getResult().getType()},
         ValueRange{callback(newOp)},
-        NamedAttribute(rewriter.getStringAttr(kPassLabel),
+        NamedAttribute(rewriter.getStringAttr(kSVELegalizerTag),
                        rewriter.getUnitAttr()));
   });
 }
 
-/// Extracts the legal memref value from the `unrealized_conversion_casts` added
-/// by this pass.
-static FailureOr<Value> getLegalMemRef(Value illegalMemref) {
+/// Extracts the legal SVE memref value from the `unrealized_conversion_casts`
+/// added by this pass.
+static FailureOr<Value> getSVELegalizedMemref(Value illegalMemref) {
   Operation *definingOp = illegalMemref.getDefiningOp();
-  if (!definingOp || !definingOp->hasAttr(kPassLabel))
+  if (!definingOp || !definingOp->hasAttr(kSVELegalizerTag))
     return failure();
   auto unrealizedConversion =
       llvm::cast<UnrealizedConversionCastOp>(definingOp);
@@ -95,8 +97,13 @@ struct RelaxScalableVectorAllocaAlignment
   }
 };
 
-/// Replaces allocations of SVE predicates smaller than an svbool with a wider
-/// allocation and a tagged unrealized conversion.
+/// Replaces allocations of SVE predicates smaller than an svbool_t (illegal)
+/// with a wider allocation of svbool_t (legal) followed by a tagged unrealized
+/// conversion to the original type.
+///
+/// svbool = vector<...x[16]xi1>, which maps to some multiple of full SVE
+/// predicate registers. A full predicate is the smallest quantity that can be
+/// loaded/stored.
 ///
 /// Example
 /// ```
@@ -110,7 +117,7 @@ struct RelaxScalableVectorAllocaAlignment
 ///     {__arm_sve_legalize_vector_storage__}
 /// ```
 template <typename AllocLikeOp>
-struct LegalizeAllocLikeOpConversion : public OpRewritePattern<AllocLikeOp> {
+struct LegalizeSVEMaskAllocation : public OpRewritePattern<AllocLikeOp> {
   using OpRewritePattern<AllocLikeOp>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(AllocLikeOp allocLikeOp,
@@ -118,7 +125,7 @@ struct LegalizeAllocLikeOpConversion : public OpRewritePattern<AllocLikeOp> {
     auto vectorType =
         llvm::dyn_cast<VectorType>(allocLikeOp.getType().getElementType());
 
-    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+    if (!vectorType || !isLogicalSVEMaskType(vectorType))
       return failure();
 
     // Replace this alloc-like op of an SVE mask with one of a (storable)
@@ -155,7 +162,7 @@ struct LegalizeAllocLikeOpConversion : public OpRewritePattern<AllocLikeOp> {
 ///   : memref<3xvector<[16]xi1>> to memref<3xvector<[8]xi1>>
 ///     {__arm_sve_legalize_vector_storage__}
 /// ```
-struct LegalizeVectorTypeCastConversion
+struct LegalizeSVEMaskTypeCastConversion
     : public OpRewritePattern<vector::TypeCastOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -164,10 +171,10 @@ struct LegalizeVectorTypeCastConversion
     auto resultType = typeCastOp.getResultMemRefType();
     auto vectorType = llvm::dyn_cast<VectorType>(resultType.getElementType());
 
-    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+    if (!vectorType || !isLogicalSVEMaskType(vectorType))
       return failure();
 
-    auto legalMemref = getLegalMemRef(typeCastOp.getMemref());
+    auto legalMemref = getSVELegalizedMemref(typeCastOp.getMemref());
     if (failed(legalMemref))
       return failure();
 
@@ -185,8 +192,9 @@ struct LegalizeVectorTypeCastConversion
   }
 };
 
-/// Replaces stores to unrealized conversions to illegal memref types with
-/// `arm_sve.convert_to_svbool`s followed by (legal) wider stores.
+/// Replaces stores to unrealized conversions to illegal SVE predicate memref
+/// (!= svbool_t) types with `arm_sve.convert_to_svbool`s followed by (legal)
+/// wider stores.
 ///
 /// Example:
 /// ```
@@ -208,10 +216,10 @@ struct LegalizeMemrefStoreConversion
     Value valueToStore = storeOp.getValueToStore();
     auto vectorType = llvm::dyn_cast<VectorType>(valueToStore.getType());
 
-    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+    if (!vectorType || !isLogicalSVEMaskType(vectorType))
       return failure();
 
-    auto legalMemref = getLegalMemRef(storeOp.getMemref());
+    auto legalMemref = getSVELegalizedMemref(storeOp.getMemref());
     if (failed(legalMemref))
       return failure();
 
@@ -232,8 +240,9 @@ struct LegalizeMemrefStoreConversion
   }
 };
 
-/// Replaces loads from unrealized conversions to illegal memref types with
-/// (legal) wider loads, followed by `arm_sve.convert_from_svbool`s.
+/// Replaces loads from unrealized conversions to illegal SVE predicate memref
+/// (!= svbool_t) types with (legal) wider loads, followed by
+/// `arm_sve.convert_from_svbool`s.
 ///
 /// Example:
 /// ```
@@ -244,7 +253,7 @@ struct LegalizeMemrefStoreConversion
 /// %svbool = memref.load %widened[] : memref<vector<[16]xi1>>
 /// %reload = arm_sve.convert_from_svbool %svbool : vector<[4]xi1>
 /// ```
-struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
+struct LegalizeSVEMaskLoadConversion : public OpRewritePattern<memref::LoadOp> {
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(memref::LoadOp loadOp,
@@ -254,10 +263,10 @@ struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
     Value loadedMask = loadOp.getResult();
     auto vectorType = llvm::dyn_cast<VectorType>(loadedMask.getType());
 
-    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+    if (!vectorType || !isLogicalSVEMaskType(vectorType))
       return failure();
 
-    auto legalMemref = getLegalMemRef(loadOp.getMemref());
+    auto legalMemref = getSVELegalizedMemref(loadOp.getMemref());
     if (failed(legalMemref))
       return failure();
 
@@ -280,10 +289,10 @@ struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
 void mlir::arm_sve::populateLegalizeVectorStoragePatterns(
     RewritePatternSet &patterns) {
   patterns.add<RelaxScalableVectorAllocaAlignment,
-               LegalizeAllocLikeOpConversion<memref::AllocaOp>,
-               LegalizeAllocLikeOpConversion<memref::AllocOp>,
-               LegalizeVectorTypeCastConversion, LegalizeMemrefStoreConversion,
-               LegalizeMemrefLoadConversion>(patterns.getContext());
+               LegalizeSVEMaskAllocation<memref::AllocaOp>,
+               LegalizeSVEMaskAllocation<memref::AllocOp>,
+               LegalizeSVEMaskTypeCastConversion, LegalizeMemrefStoreConversion,
+               LegalizeSVEMaskLoadConversion>(patterns.getContext());
 }
 
 namespace {
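These patterns are driven by the `-arm-sve-legalize-vector-storage` pass named in this PR; as a sketch (the exact RUN line in the tests may differ), a FileCheck invocation looks like:

```mlir
// RUN: mlir-opt %s -arm-sve-legalize-vector-storage | FileCheck %s
```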
@@ -300,7 +309,7 @@ struct LegalizeVectorStorage
     ConversionTarget target(getContext());
     target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
         [](UnrealizedConversionCastOp unrealizedConversion) {
-          return !unrealizedConversion->hasAttr(kPassLabel);
+          return !unrealizedConversion->hasAttr(kSVELegalizerTag);
         });
     // This detects if we failed to completely legalize the IR.
     if (failed(applyPartialConversion(getOperation(), target, {})))
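That is, any `unrealized_conversion_cast` still carrying the pass tag after rewriting is treated as illegal and triggers the error. A sketch of such a leftover op (shapes hypothetical, matching the type-cast example above):

```mlir
// Still tagged after the pass => reported as a legalization failure.
%0 = builtin.unrealized_conversion_cast %widened
    : memref<3xvector<[16]xi1>> to memref<3xvector<[8]xi1>>
    {__arm_sve_legalize_vector_storage__}
```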
diff --git a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
index 61879c48712f4d2..a2c39e3e72b43b2 100644
--- a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
+++ b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
@@ -95,6 +95,15 @@ func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]
 
 // CHECK-LABEL: @set_sve_alloca_alignment
 func.func @set_sve_alloca_alignment() {
+  /// This checks that the alignment of allocas of scalable vectors will be
+  /// something the backend can handle. Currently, the backend sets the
+  /// alignment of scalable vectors to their base size (i.e. their size at
+  /// vscale = 1). This works for hardware-sized types, which always get a
+  /// 16-byte alignment. The problem is that larger types, e.g. vector<[8]xf32>,
+  /// end up with alignments larger than 16 bytes (e.g. 32 bytes here), which
+  /// are unsupported. This pass avoids the issue by explicitly setting the
+  /// alignment to 16 bytes for all scalable vectors.
+
   // CHECK-COUNT-6: alignment = 16
   %a1 = memref.alloca() : memref<vector<[32]xi8>>
   %a2 = memref.alloca() : memref<vector<[16]xi8>>
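For illustration, a sketch of the legalized form the allocas above are expected to take (the test checks only the alignment attribute):

```mlir
// After -arm-sve-legalize-vector-storage (sketch): explicit 16-byte alignment.
%a1 = memref.alloca() {alignment = 16 : i64} : memref<vector<[32]xi8>>
```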
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
index 260cd1eabe7dae1..c486bf0de5d3528 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -21,11 +21,11 @@ func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>)  {
   %row1 = vector.extract %vector[1] : vector<[8]xf32> from vector<3x[8]xf32>
   %row2 = vector.extract %vector[2] : vector<[8]xf32> from vector<3x[8]xf32>
 
-  /// Print each of the vectors. This only checks the first eight elements (which
-  /// works for all vscale >= 1).
+  /// Print each of the vectors.
+  /// vscale is >= 1, so at least 8 elements will be printed.
 
-  // CHECK-LABEL: TEST 1
-  vector.print str "TEST 1 (print and read 2D arrays of scalable vectors)"
+  vector.print str "read_and_print_2d_vector()"
+  // CHECK-LABEL: read_and_print_2d_vector()
   // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
   vector.print %row0 : vector<[8]xf32>
   // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
@@ -56,10 +56,8 @@ func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x
   %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1>
   %mask_b = vector.create_mask %c2, %c3, %dim_b : vector<1x2x[4]xi1>
 
-  vector.print str "TEST 2 (reading and adding two 3D arrays of scalable vectors)"
-
-  /// Print each of the vectors. This only checks the first four elements (which
-  /// works for all vscale >= 1).
+  /// Print each of the vectors.
+  /// vscale is >= 1, so at least 4 elements will be printed.
 
   // CHECK-LABEL: Vector A
   // CHECK-NEXT: ( 5, 5, 5, 5
@@ -68,18 +66,18 @@ func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x
   %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> ()
 
-  vector.print str "\nVector B"
   // CHECK-LABEL: Vector B
   // CHECK-NEXT: ( 4, 4, 4, 4
   // CHECK-NEXT: ( 4, 4, 4, 4
+  vector.print str "\nVector B"
   %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> ()
 
-  %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
   // CHECK-LABEL: Sum
   // CHECK-NEXT: ( 9, 9, 9, 9
   // CHECK-NEXT: ( 9, 9, 9, 9
   vector.print str "\nSum"
+  %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> ()
 
   return
@@ -94,13 +92,12 @@ func.func @entry() {
   %f32_5 = arith.constant 5.0 : f32
   %f32_4 = arith.constant 4.0 : f32
 
-  vector.print str "\n====================\n"
-
   %test_1_memref_size = arith.muli %vscale, %c8 : index
   %test_1_memref = memref.alloca(%test_1_memref_size) : memref<3x?xf32>
 
   linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>)
 
+  vector.print str "=> Print and read 2D arrays of scalable vectors:"
   func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> ()
 
   vector.print str "\n====================\n"
@@ -112,10 +109,9 @@ func.func @entry() {
   linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>)
   linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b :memref<1x2x?xf32>)
 
+  vector.print str "=> Reading and adding two 3D arrays of scalable vectors:"
   func.call @add_arrays_of_scalable_vectors(
     %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> ()
 
-  vector.print str "\n====================\n"
-
   return
 }

>From ac20ef310ecb73a343673c81ce4dae8e8f59346b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 24 Oct 2023 17:54:01 +0000
Subject: [PATCH 5/5] Fixups

---
 .../Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
index 9d34afd3458a3b3..4327f437d55a36d 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -23,6 +23,8 @@ using namespace mlir::arm_sve;
 
 // A tag to mark unrealized_conversions produced by this pass. This is used to
 // detect IR that this pass failed to completely legalize, and to report an error.
+// If everything was successfully legalized, no tagged ops will remain after
+// this pass.
 constexpr StringLiteral kSVELegalizerTag("__arm_sve_legalize_vector_storage__");
 
 namespace {
@@ -205,7 +207,7 @@ struct LegalizeSVEMaskTypeCastConversion
 /// %svbool = arm_sve.convert_to_svbool %mask : vector<[8]xi1>
 /// memref.store %svbool, %widened[] : memref<vector<[16]xi1>>
 /// ```
-struct LegalizeMemrefStoreConversion
+struct LegalizeSVEMaskStoreConversion
     : public OpRewritePattern<memref::StoreOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -291,8 +293,9 @@ void mlir::arm_sve::populateLegalizeVectorStoragePatterns(
   patterns.add<RelaxScalableVectorAllocaAlignment,
                LegalizeSVEMaskAllocation<memref::AllocaOp>,
                LegalizeSVEMaskAllocation<memref::AllocOp>,
-               LegalizeSVEMaskTypeCastConversion, LegalizeMemrefStoreConversion,
-               LegalizeSVEMaskLoadConversion>(patterns.getContext());
+               LegalizeSVEMaskTypeCastConversion,
+               LegalizeSVEMaskStoreConversion, LegalizeSVEMaskLoadConversion>(
+      patterns.getContext());
 }
 
 namespace {


