[flang-commits] [flang] [mlir][ArmSVE] Add `-arm-sve-legalize-vector-storage` pass (PR #68794)

Benjamin Maxwell via flang-commits flang-commits at lists.llvm.org
Wed Oct 11 04:52:57 PDT 2023


https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/68794

This patch adds a pass that ensures that loads, stores, and allocations of SVE vector types will be legal in the LLVM backend. It does this at the memref level, so this pass must be applied before lowering all the way to LLVM.
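
For context, a minimal sketch of where this pass might sit in a lowering pipeline (the surrounding flags are taken from the existing SVE tests in this patch; the exact pipeline is illustrative):

```mlir
// RUN: mlir-opt %s -arm-sve-legalize-vector-storage \
// RUN:   -convert-vector-to-llvm="enable-arm-sve" \
// RUN:   -convert-func-to-llvm -reconcile-unrealized-casts
```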

This pass currently fixes two issues.

## Loading and storing predicate types

It is only legal to load/store predicate types equal to (or greater than) a full predicate register, which in MLIR is `vector<[16]xi1>`. Smaller predicate types (`vector<[1|2|4|8]xi1>`) must be converted to/from a full predicate type (referred to as an `svbool`) before storing and after loading, respectively. This pass does this by widening allocations and inserting conversion intrinsics.

For example:


```mlir
%alloca = memref.alloca() : memref<vector<[4]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
%reload = memref.load %alloca[] : memref<vector<[4]xi1>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
%svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
%reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
%reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
```

## Relax alignments for SVE vector allocas

The storage for SVE vector types only needs to have an alignment that matches the element type (for example, 4-byte alignment for `f32`s). However, the LLVM backend currently defaults to aligning to `base size` x `element size` bytes. For non-legal vector types like `vector<[8]xf32>` this results in 8 x 4 = 32-byte alignment, but the backend only supports up to 16-byte alignment for SVE vectors on the stack. Explicitly setting a smaller alignment prevents this issue.
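
For example (a sketch of the pass output; the alignment constant follows the element type's byte size, just like the `alignment = 1` on the widened `i1` alloca above):

```mlir
%alloca = memref.alloca() : memref<vector<[8]xf32>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 4 : i64} : memref<vector<[8]xf32>>
```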

Depends on: #68586 and #68695 (for testing)

>From e521859869a75e9ff7117c2a37b3d1cb55ef7654 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 10 Oct 2023 11:11:33 +0000
Subject: [PATCH 1/4] [mlir][ODS] Omit printing default-valued attributes in
 oilists

This makes these match the behaviour of optional attributes (which are
omitted when they have their default value of `none`). This allows for
concise assembly formats without a custom printer.

An extra print of " " is also removed. This does not change any existing
uses of oilists, but if the parameter before the oilist was optional,
it would previously add an extra space.
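
For example, a default-valued `hint(none)` clause is now elided when
printing (taken from the updated tests below):

```
// Previously printed as:
omp.atomic.write %addr = %val hint(none) : memref<i32>, i32
// Now prints as:
omp.atomic.write %addr = %val : memref<i32>, i32
```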
---
 flang/test/Lower/OpenMP/FIR/atomic-read.f90   |  2 +-
 flang/test/Lower/OpenMP/FIR/critical.f90      |  2 +-
 flang/test/Lower/OpenMP/critical.f90          |  2 +-
 .../OpenMPToLLVM/convert-to-llvmir.mlir       |  4 +--
 mlir/test/Dialect/OpenMP/ops.mlir             | 10 ++++----
 mlir/tools/mlir-tblgen/OpFormatGen.cpp        | 25 +++++++++++++------
 6 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/flang/test/Lower/OpenMP/FIR/atomic-read.f90 b/flang/test/Lower/OpenMP/FIR/atomic-read.f90
index ff2b651953f2abc..0079b347fac8de6 100644
--- a/flang/test/Lower/OpenMP/FIR/atomic-read.f90
+++ b/flang/test/Lower/OpenMP/FIR/atomic-read.f90
@@ -14,7 +14,7 @@
 !CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
 !CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
 !CHECK: omp.atomic.read %[[VAR_X]] = %[[VAR_Y]] memory_order(acquire)  hint(uncontended) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[VAR_A]] = %[[VAR_B]] memory_order(relaxed) hint(none)  : !fir.ref<!fir.char<1>>, !fir.char<1>
+!CHECK: omp.atomic.read %[[VAR_A]] = %[[VAR_B]] memory_order(relaxed) : !fir.ref<!fir.char<1>>, !fir.char<1>
 !CHECK: omp.atomic.read %[[VAR_C]] = %[[VAR_D]] memory_order(seq_cst)  hint(contended) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
 !CHECK: omp.atomic.read %[[VAR_E]] = %[[VAR_F]] hint(speculative) : !fir.ref<!fir.char<1,8>>, !fir.char<1,8>
 !CHECK: omp.atomic.read %[[VAR_G]] = %[[VAR_H]] hint(nonspeculative) : !fir.ref<f32>, f32
diff --git a/flang/test/Lower/OpenMP/FIR/critical.f90 b/flang/test/Lower/OpenMP/FIR/critical.f90
index c6ac818fe21aa6e..b86729f8a98e370 100644
--- a/flang/test/Lower/OpenMP/FIR/critical.f90
+++ b/flang/test/Lower/OpenMP/FIR/critical.f90
@@ -2,7 +2,7 @@
 !RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefix="OMPDialect"
 !RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | tco | FileCheck %s --check-prefix="LLVMIR"
 
-!OMPDialect: omp.critical.declare @help2 hint(none)
+!OMPDialect: omp.critical.declare @help2
 !OMPDialect: omp.critical.declare @help1 hint(contended)
 
 subroutine omp_critical()
diff --git a/flang/test/Lower/OpenMP/critical.f90 b/flang/test/Lower/OpenMP/critical.f90
index 9fbd172df96421c..5a4d2e4815df49e 100644
--- a/flang/test/Lower/OpenMP/critical.f90
+++ b/flang/test/Lower/OpenMP/critical.f90
@@ -1,6 +1,6 @@
 !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
 
-!CHECK: omp.critical.declare @help2 hint(none)
+!CHECK: omp.critical.declare @help2
 !CHECK: omp.critical.declare @help1 hint(contended)
 
 subroutine omp_critical()
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index 1df27dd9957e594..881d738b413ef15 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -90,7 +90,7 @@ func.func @wsloop(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4:
 // CHECK-LABEL: @atomic_write
 // CHECK: (%[[ARG0:.*]]: !llvm.ptr<i32>)
 // CHECK: %[[VAL0:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: omp.atomic.write %[[ARG0]] = %[[VAL0]] hint(none) memory_order(relaxed) : !llvm.ptr<i32>, i32
+// CHECK: omp.atomic.write %[[ARG0]] = %[[VAL0]] memory_order(relaxed) : !llvm.ptr<i32>, i32
 func.func @atomic_write(%a: !llvm.ptr<i32>) -> () {
   %1 = arith.constant 1 : i32
   omp.atomic.write %a = %1 hint(none) memory_order(relaxed) : !llvm.ptr<i32>, i32
@@ -474,4 +474,4 @@ llvm.func @_QPtarget_map_with_bounds(%arg0: !llvm.ptr<i32>, %arg1: !llvm.ptr<arr
     omp.terminator
   }
   llvm.return
-}
\ No newline at end of file
+}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 13cbea6c9923c22..27c31824c0506df 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -831,7 +831,7 @@ omp.critical.declare @mutex6 hint(contended, nonspeculative)
 omp.critical.declare @mutex7 hint(uncontended, speculative)
 // CHECK: omp.critical.declare @mutex8 hint(contended, speculative)
 omp.critical.declare @mutex8 hint(contended, speculative)
-// CHECK: omp.critical.declare @mutex9 hint(none)
+// CHECK: omp.critical.declare @mutex9
 omp.critical.declare @mutex9 hint(none)
 // CHECK: omp.critical.declare @mutex10
 omp.critical.declare @mutex10
@@ -909,7 +909,7 @@ func.func @omp_atomic_read(%v: memref<i32>, %x: memref<i32>) {
   omp.atomic.read %v = %x hint(nonspeculative, contended) : memref<i32>, i32
   // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) hint(contended, speculative) : memref<i32>, i32
   omp.atomic.read %v = %x hint(speculative, contended) memory_order(seq_cst) : memref<i32>, i32
-  // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) hint(none) : memref<i32>, i32
+  // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) : memref<i32>, i32
   omp.atomic.read %v = %x hint(none) memory_order(seq_cst) : memref<i32>, i32
   return
 }
@@ -927,7 +927,7 @@ func.func @omp_atomic_write(%addr : memref<i32>, %val : i32) {
   omp.atomic.write %addr = %val memory_order(relaxed) : memref<i32>, i32
   // CHECK: omp.atomic.write %[[ADDR]] = %[[VAL]] hint(uncontended, speculative) : memref<i32>, i32
   omp.atomic.write %addr = %val hint(speculative, uncontended) : memref<i32>, i32
-  // CHECK: omp.atomic.write %[[ADDR]] = %[[VAL]] hint(none) : memref<i32>, i32
+  // CHECK: omp.atomic.write %[[ADDR]] = %[[VAL]] : memref<i32>, i32
   omp.atomic.write %addr = %val hint(none) : memref<i32>, i32
   return
 }
@@ -1004,7 +1004,7 @@ func.func @omp_atomic_update(%x : memref<i32>, %expr : i32, %xBool : memref<i1>,
     omp.yield(%const:i32)
   }
 
-  // CHECK: omp.atomic.update hint(none) %[[X]] : memref<i32>
+  // CHECK: omp.atomic.update %[[X]] : memref<i32>
   // CHECK-NEXT: (%[[XVAL:.*]]: i32):
   // CHECK-NEXT:   %[[NEWVAL:.*]] = llvm.add %[[XVAL]], %[[EXPR]] : i32
   // CHECK-NEXT:   omp.yield(%[[NEWVAL]] : i32)
@@ -1181,7 +1181,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
     omp.atomic.write %x = %expr : memref<i32>, i32
   }
 
-  // CHECK: omp.atomic.capture hint(none) {
+  // CHECK: omp.atomic.capture {
   // CHECK-NEXT: omp.atomic.update %[[x]] : memref<i32>
   // CHECK-NEXT: (%[[xval:.*]]: i32):
   // CHECK-NEXT:   %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index bdb97866a47fc9d..240780b0abf1156 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -2009,6 +2009,15 @@ static void genEnumAttrPrinter(const NamedAttribute *var, const Operator &op,
           "  }\n";
 }
 
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+                                    AttributeVariable &attrElement) {
+  FmtContext fctx;
+  Attribute attr = attrElement.getVar()->attr;
+  fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
+  body << " && " << op.getGetterName(attrElement.getVar()->name) << "Attr() != "
+       << tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue());
+}
+
 /// Generate the check for the anchor of an optional group.
 static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
                                           const Operator &op,
@@ -2042,12 +2051,7 @@ static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
         if (attr.hasDefaultValue()) {
           // Consider a default-valued attribute as present if it's not the
           // default value.
-          FmtContext fctx;
-          fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
-          body << " && " << op.getGetterName(element->getVar()->name)
-               << "Attr() != "
-               << tgfmt(attr.getConstBuilderTemplate(), &fctx,
-                        attr.getDefaultValue());
+          genNonDefaultValueCheck(body, op, *element);
           return;
         }
         llvm_unreachable("attribute must be optional or default-valued");
@@ -2158,7 +2162,6 @@ void OperationFormat::genElementPrinter(FormatElement *element,
 
   // Emit the OIList
   if (auto *oilist = dyn_cast<OIListElement>(element)) {
-    genLiteralPrinter(" ", body, shouldEmitSpace, lastWasPunctuation);
     for (auto clause : oilist->getClauses()) {
       LiteralElement *lelement = std::get<0>(clause);
       ArrayRef<FormatElement *> pelement = std::get<1>(clause);
@@ -2170,8 +2173,14 @@ void OperationFormat::genElementPrinter(FormatElement *element,
       for (VariableElement *var : vars) {
         TypeSwitch<FormatElement *>(var)
             .Case([&](AttributeVariable *attrEle) {
-              body << " || " << op.getGetterName(attrEle->getVar()->name)
+              body << " || (" << op.getGetterName(attrEle->getVar()->name)
                    << "Attr()";
+              Attribute attr = attrEle->getVar()->attr;
+              if (attr.hasDefaultValue()) {
+                // Don't print default-valued attributes.
+                genNonDefaultValueCheck(body, op, *attrEle);
+              }
+              body << ")";
             })
             .Case([&](OperandVariable *ele) {
               if (ele->getVar()->isVariadic()) {

>From 6d96a5527d59c9262701ea98bee223488a84bb37 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 10 Oct 2023 11:38:12 +0000
Subject: [PATCH 2/4] [mlir][VectorOps] Support string literals in
 `vector.print`

Printing strings within integration tests is currently quite annoyingly
verbose, and can't be tucked into shared helpers as the types depend on
the length of the string:

```
llvm.mlir.global internal constant @hello_world("Hello, World!\00")

func.func @entry() {
  %0 = llvm.mlir.addressof @hello_world : !llvm.ptr<array<14 x i8>>
  %1 = llvm.mlir.constant(0 : index) : i64
  %2 = llvm.getelementptr %0[%1, %1]
    : (!llvm.ptr<array<14 x i8>>, i64, i64) -> !llvm.ptr<i8>
  llvm.call @printCString(%2) : (!llvm.ptr<i8>) -> ()
  return
}
```

So this patch adds a simple extension to `vector.print` to simplify
this:
```
func.func @entry() {
   // Print a vector of characters ;)
   vector.print str "Hello, World!"
   return
}
```

Most of the logic for this is now shared with `cf.assert` which already
does something similar.
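
For reference, a sketch of the IR `vector.print str` lowers to (adapted
from the `vector-to-llvm` test added below; the symbol name is
illustrative, as it is uniqued per module):

```
llvm.func @puts(!llvm.ptr)
llvm.mlir.global private constant @vector_print_str(dense<[72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33, 0]> : tensor<14xi8>) : !llvm.array<14 x i8>

func.func @vector_print_string() {
  %0 = llvm.mlir.addressof @vector_print_str : !llvm.ptr
  %1 = llvm.getelementptr %0[0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<14 x i8>
  llvm.call @puts(%1) : (!llvm.ptr) -> ()
  return
}
```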
---
 .../Conversion/LLVMCommon/PrintCallHelper.h   | 36 ++++++++++
 .../mlir/Dialect/Vector/IR/VectorOps.td       | 37 +++++++++--
 .../ControlFlowToLLVM/ControlFlowToLLVM.cpp   | 49 +-------------
 mlir/lib/Conversion/LLVMCommon/CMakeLists.txt |  1 +
 .../Conversion/LLVMCommon/PrintCallHelper.cpp | 66 +++++++++++++++++++
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  6 +-
 .../VectorToLLVM/vector-to-llvm.mlir          | 14 ++++
 .../Dialect/Vector/CPU/test-hello-world.mlir  | 10 +++
 8 files changed, 168 insertions(+), 51 deletions(-)
 create mode 100644 mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
 create mode 100644 mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/test-hello-world.mlir

diff --git a/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h b/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
new file mode 100644
index 000000000000000..7e26858589f2756
--- /dev/null
+++ b/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
@@ -0,0 +1,36 @@
+
+//===- PrintCallHelper.h - LLVM Interfaces ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_LLVMIR_PRINTCALLHELPER_H_
+#define MLIR_DIALECT_LLVMIR_PRINTCALLHELPER_H_
+
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+
+class Location;
+class ModuleOp;
+class OpBuilder;
+class Operation;
+class Type;
+class ValueRange;
+class LLVMTypeConverter;
+
+namespace LLVM {
+
+/// Generate IR that prints the given string to stdout.
+void createPrintStrCall(OpBuilder &builder, Location loc, ModuleOp moduleOp,
+                        StringRef symbolName, StringRef string,
+                        const LLVMTypeConverter &typeConverter);
+} // namespace LLVM
+
+} // namespace mlir
+
+#endif
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 2df2fe4c5ce8e9c..2b60055ca9db94b 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -26,6 +26,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/VectorInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/IR/BuiltinAttributes.td"
 
 // TODO: Add an attribute to specify a different algebra with operators other
 // than the current set: {*, +}.
@@ -2476,12 +2477,18 @@ def Vector_TransposeOp :
 }
 
 def Vector_PrintOp :
-  Vector_Op<"print", []>,
+  Vector_Op<"print", [
+    PredOpTrait<
+      "`source` or `punctuation` are not set when printing strings",
+      CPred<"!getStringLiteral() || (!getSource() && getPunctuation() == PrintPunctuation::NewLine)">
+    >,
+  ]>,
   Arguments<(ins Optional<Type<Or<[
     AnyVectorOfAnyRank.predicate,
     AnyInteger.predicate, Index.predicate, AnyFloat.predicate
   ]>>>:$source, DefaultValuedAttr<Vector_PrintPunctuation,
-                      "::mlir::vector::PrintPunctuation::NewLine">:$punctuation)
+                      "::mlir::vector::PrintPunctuation::NewLine">:$punctuation,
+                OptionalAttr<Builtin_StringAttr>:$stringLiteral)
   > {
   let summary = "print operation (for testing and debugging)";
   let description = [{
@@ -2520,6 +2527,13 @@ def Vector_PrintOp :
     ```mlir
     vector.print punctuation <newline>
     ```
+
+    Additionally, to aid with debugging and testing `vector.print` can also
+    print constant strings:
+
+    ```mlir
+    vector.print str "Hello, World!"
+    ```
   }];
   let extraClassDeclaration = [{
     Type getPrintType() {
@@ -2528,11 +2542,26 @@ def Vector_PrintOp :
   }];
   let builders = [
     OpBuilder<(ins "PrintPunctuation":$punctuation), [{
-      build($_builder, $_state, {}, punctuation);
+      build($_builder, $_state, {}, punctuation, {});
+    }]>,
+    OpBuilder<(ins "::mlir::Value":$source), [{
+      build($_builder, $_state, source, PrintPunctuation::NewLine);
+    }]>,
+    OpBuilder<(ins "::mlir::Value":$source, "PrintPunctuation":$punctuation), [{
+      build($_builder, $_state, source, punctuation, {});
+    }]>,
+    OpBuilder<(ins "::llvm::StringRef":$string), [{
+      build($_builder, $_state, {}, PrintPunctuation::NewLine, $_builder.getStringAttr(string));
     }]>,
   ];
 
-  let assemblyFormat = "($source^ `:` type($source))? (`punctuation` $punctuation^)? attr-dict";
+  let assemblyFormat = [{
+      ($source^ `:` type($source))?
+        oilist(
+            `str` $stringLiteral
+          | `punctuation` $punctuation)
+        attr-dict
+    }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index a4f146bbe475cc6..6b7647b038f1d94 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/LLVMCommon/PrintCallHelper.h"
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
@@ -36,51 +37,6 @@ using namespace mlir;
 
 #define PASS_NAME "convert-cf-to-llvm"
 
-static std::string generateGlobalMsgSymbolName(ModuleOp moduleOp) {
-  std::string prefix = "assert_msg_";
-  int counter = 0;
-  while (moduleOp.lookupSymbol(prefix + std::to_string(counter)))
-    ++counter;
-  return prefix + std::to_string(counter);
-}
-
-/// Generate IR that prints the given string to stderr.
-static void createPrintMsg(OpBuilder &builder, Location loc, ModuleOp moduleOp,
-                           StringRef msg,
-                           const LLVMTypeConverter &typeConverter) {
-  auto ip = builder.saveInsertionPoint();
-  builder.setInsertionPointToStart(moduleOp.getBody());
-  MLIRContext *ctx = builder.getContext();
-
-  // Create a zero-terminated byte representation and allocate global symbol.
-  SmallVector<uint8_t> elementVals;
-  elementVals.append(msg.begin(), msg.end());
-  elementVals.push_back(0);
-  auto dataAttrType = RankedTensorType::get(
-      {static_cast<int64_t>(elementVals.size())}, builder.getI8Type());
-  auto dataAttr =
-      DenseElementsAttr::get(dataAttrType, llvm::ArrayRef(elementVals));
-  auto arrayTy =
-      LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size());
-  std::string symbolName = generateGlobalMsgSymbolName(moduleOp);
-  auto globalOp = builder.create<LLVM::GlobalOp>(
-      loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private, symbolName,
-      dataAttr);
-
-  // Emit call to `printStr` in runtime library.
-  builder.restoreInsertionPoint(ip);
-  auto msgAddr = builder.create<LLVM::AddressOfOp>(
-      loc, typeConverter.getPointerType(arrayTy), globalOp.getName());
-  SmallVector<LLVM::GEPArg> indices(1, 0);
-  Value gep = builder.create<LLVM::GEPOp>(
-      loc, typeConverter.getPointerType(builder.getI8Type()), arrayTy, msgAddr,
-      indices);
-  Operation *printer = LLVM::lookupOrCreatePrintStrFn(
-      moduleOp, typeConverter.useOpaquePointers());
-  builder.create<LLVM::CallOp>(loc, TypeRange(), SymbolRefAttr::get(printer),
-                               gep);
-}
-
 namespace {
 /// Lower `cf.assert`. The default lowering calls the `abort` function if the
 /// assertion is violated and has no effect otherwise. The failure message is
@@ -105,7 +61,8 @@ struct AssertOpLowering : public ConvertOpToLLVMPattern<cf::AssertOp> {
 
     // Failed block: Generate IR to print the message and call `abort`.
     Block *failureBlock = rewriter.createBlock(opBlock->getParent());
-    createPrintMsg(rewriter, loc, module, op.getMsg(), *getTypeConverter());
+    LLVM::createPrintStrCall(rewriter, loc, module, "assert_msg", op.getMsg(),
+                             *getTypeConverter());
     if (abortOnFailedAssert) {
       // Insert the `abort` declaration if necessary.
       auto abortFunc = module.lookupSymbol<LLVM::LLVMFuncOp>("abort");
diff --git a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
index 091cd539f0ae014..568d9339aaabcb4 100644
--- a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_conversion_library(MLIRLLVMCommonConversion
   LoweringOptions.cpp
   MemRefBuilder.cpp
   Pattern.cpp
+  PrintCallHelper.cpp
   StructBuilder.cpp
   TypeConverter.cpp
   VectorPattern.cpp
diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
new file mode 100644
index 000000000000000..487abb435d10ad7
--- /dev/null
+++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
@@ -0,0 +1,66 @@
+
+//===- PrintCallHelper.cpp - LLVM Interfaces --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/LLVMCommon/PrintCallHelper.h"
+#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "llvm/ADT/ArrayRef.h"
+
+using namespace mlir;
+using namespace llvm;
+
+static std::string ensureSymbolNameIsUnique(ModuleOp moduleOp,
+                                            StringRef symbolName) {
+  static int counter = 0;
+  std::string uniqueName = std::string(symbolName);
+  while (moduleOp.lookupSymbol(uniqueName)) {
+    uniqueName = std::string(symbolName) + "_" + std::to_string(counter++);
+  }
+  return uniqueName;
+}
+
+void mlir::LLVM::createPrintStrCall(OpBuilder &builder, Location loc,
+                                    ModuleOp moduleOp, StringRef symbolName,
+                                    StringRef string,
+                                    const LLVMTypeConverter &typeConverter) {
+  auto ip = builder.saveInsertionPoint();
+  builder.setInsertionPointToStart(moduleOp.getBody());
+  MLIRContext *ctx = builder.getContext();
+
+  // Create a zero-terminated byte representation and allocate global symbol.
+  SmallVector<uint8_t> elementVals;
+  elementVals.append(string.begin(), string.end());
+  elementVals.push_back(0);
+  auto dataAttrType = RankedTensorType::get(
+      {static_cast<int64_t>(elementVals.size())}, builder.getI8Type());
+  auto dataAttr =
+      DenseElementsAttr::get(dataAttrType, llvm::ArrayRef(elementVals));
+  auto arrayTy =
+      LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size());
+  auto globalOp = builder.create<LLVM::GlobalOp>(
+      loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private,
+      ensureSymbolNameIsUnique(moduleOp, symbolName), dataAttr);
+
+  // Emit call to `printStr` in runtime library.
+  builder.restoreInsertionPoint(ip);
+  auto msgAddr = builder.create<LLVM::AddressOfOp>(
+      loc, typeConverter.getPointerType(arrayTy), globalOp.getName());
+  SmallVector<LLVM::GEPArg> indices(1, 0);
+  Value gep = builder.create<LLVM::GEPOp>(
+      loc, typeConverter.getPointerType(builder.getI8Type()), arrayTy, msgAddr,
+      indices);
+  Operation *printer = LLVM::lookupOrCreatePrintStrFn(
+      moduleOp, typeConverter.useOpaquePointers());
+  builder.create<LLVM::CallOp>(loc, TypeRange(), SymbolRefAttr::get(printer),
+                               gep);
+}
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 8427d60f14c0bcc..4af58653c8227ae 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 
 #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
+#include "mlir/Conversion/LLVMCommon/PrintCallHelper.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -1548,7 +1549,10 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
     }
 
     auto punct = printOp.getPunctuation();
-    if (punct != PrintPunctuation::NoPunctuation) {
+    if (auto stringLiteral = printOp.getStringLiteral()) {
+      LLVM::createPrintStrCall(rewriter, loc, parent, "vector_print_str",
+                               *stringLiteral, *getTypeConverter());
+    } else if (punct != PrintPunctuation::NoPunctuation) {
       emitCall(rewriter, printOp->getLoc(), [&] {
         switch (punct) {
         case PrintPunctuation::Close:
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 9aa4d735681f576..65b3a78e295f0c4 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -1068,6 +1068,20 @@ func.func @vector_print_scalar_f64(%arg0: f64) {
 
 // -----
 
+// CHECK-LABEL: module {
+// CHECK: llvm.func @puts(!llvm.ptr)
+// CHECK: llvm.mlir.global private constant @[[GLOBAL_STR:.*]](dense<[72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33, 0]> : tensor<14xi8>) {addr_space = 0 : i32} : !llvm.array<14 x i8>
+// CHECK: @vector_print_string
+//       CHECK-NEXT: %[[GLOBAL_ADDR:.*]] = llvm.mlir.addressof @[[GLOBAL_STR]] : !llvm.ptr
+//       CHECK-NEXT: %[[STR_PTR:.*]] = llvm.getelementptr %[[GLOBAL_ADDR]][0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<14 x i8>
+//       CHECK-NEXT: llvm.call @puts(%[[STR_PTR]]) : (!llvm.ptr) -> ()
+func.func @vector_print_string() {
+  vector.print str "Hello, World!"
+  return
+}
+
+// -----
+
 func.func @extract_strided_slice1(%arg0: vector<4xf32>) -> vector<2xf32> {
   %0 = vector.extract_strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
   return %0 : vector<2xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-hello-world.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-hello-world.mlir
new file mode 100644
index 000000000000000..c4076e65151ac72
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-hello-world.mlir
@@ -0,0 +1,10 @@
+// RUN: mlir-opt %s -test-lower-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void  \
+// RUN:   -shared-libs=%mlir_c_runner_utils | \
+// RUN: FileCheck %s
+
+func.func @entry() {
+   // CHECK: Hello, World!
+   vector.print str "Hello, World!"
+   return
+}

>From e8323512d8960d1ccddf046400448bffe87a7d87 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 6 Oct 2023 14:40:00 +0000
Subject: [PATCH 3/4] [mlir][ArmSVE] Add convert_to/from_svbool ops

This adds slightly higher-level ops for converting masks between svbool
and SVE predicate types. The main reason to use these over the
intrinsics is that these ops support vectors of masks (via unrolling).

E.g.

```
// Convert an svbool mask to a mask of SVE predicates:
%svbool = vector.load %memref[%c0, %c0]
                       : memref<2x?xi1>, vector<2x[16]xi1>
%mask = arm_sve.convert_from_svbool %svbool : vector<2x[8]xi1>
// => Results in vector<2x[8]xi1>
```
Or:
```
// Convert a mask of SVE predicates to an svbool mask:
%mask = vector.create_mask %c2, %dim_size : vector<2x[2]xi1>
%svbool = arm_sve.convert_to_svbool %mask : vector<2x[2]xi1>
// => Results in vector<2x[16]xi1>
```
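
For multi-dimensional masks the lowering unrolls over the leading
dimensions, converting one scalable slice at a time. A hand-written
sketch of the expansion for the second example (the actual pattern
emits the same ops programmatically):

```
%init = arith.constant dense<false> : vector<2x[16]xi1>
%s0 = vector.extract %mask[0] : vector<2x[2]xi1>
%b0 = "arm_sve.intr.convert.to.svbool"(%s0) : (vector<[2]xi1>) -> vector<[16]xi1>
%r0 = vector.insert %b0, %init [0] : vector<[16]xi1> into vector<2x[16]xi1>
%s1 = vector.extract %mask[1] : vector<2x[2]xi1>
%b1 = "arm_sve.intr.convert.to.svbool"(%s1) : (vector<[2]xi1>) -> vector<[16]xi1>
%svbool = vector.insert %b1, %r0 [1] : vector<[16]xi1> into vector<2x[16]xi1>
```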
---
 mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td | 67 +++++++++++++++++
 mlir/include/mlir/IR/CommonTypeConstraints.td | 43 +++++++++++
 mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp  |  1 +
 mlir/lib/Dialect/ArmSVE/IR/CMakeLists.txt     |  1 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |  1 +
 .../Transforms/LegalizeForLLVMExport.cpp      | 62 +++++++++++++++-
 mlir/test/Dialect/ArmSVE/invalid.mlir         | 51 +++++++++++++
 .../Dialect/ArmSVE/legalize-for-llvm.mlir     | 73 ++++++++++++++++++-
 mlir/test/Dialect/ArmSVE/roundtrip.mlir       | 49 ++++++++++++-
 9 files changed, 343 insertions(+), 5 deletions(-)
 create mode 100644 mlir/test/Dialect/ArmSVE/invalid.mlir

diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
index d4294b4dd9fd4e8..fa7f6d080d5a91c 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
+++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
@@ -28,6 +28,8 @@ def ArmSVE_Dialect : Dialect {
     This dialect contains the definitions necessary to target specific Arm SVE
     scalable vector operations.
   }];
+
+  let dependentDialects = ["vector::VectorDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -40,6 +42,11 @@ def SVBool : ScalableVectorOfRankAndLengthAndType<
 def SVEPredicate : ScalableVectorOfRankAndLengthAndType<
   [1], [16, 8, 4, 2, 1], [I1]>;
 
+// Generalizations of SVBool and SVEPredicate to ranks >= 1.
+// These are masks with a single trailing scalable dimension.
+def SVBoolMask : TrailingScalableVectorOfSizeAndType<[16], [I1]>;
+def SVEMask : TrailingScalableVectorOfSizeAndType<[16, 8, 4, 2, 1], [I1]>;
+
 //===----------------------------------------------------------------------===//
 // ArmSVE op definitions
 //===----------------------------------------------------------------------===//
@@ -236,6 +243,66 @@ def UmmlaOp : ArmSVE_Op<"ummla",
     "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)";
 }
 
+
+class SvboolTypeConstraint<string lhsArg, string rhsArg> : TypesMatchWith<
+      "expected corresponding svbool type widened to [16]xi1",
+      lhsArg, rhsArg,
+      "VectorType(VectorType::Builder(::llvm::cast<VectorType>($_self)).setDim(::llvm::cast<VectorType>($_self).getRank() - 1, 16))">;
+
+def ConvertFromSvboolOp : ArmSVE_Op<"convert_from_svbool",
+                            [Pure, SvboolTypeConstraint<"result", "source">]>
+{
+  let summary = "Convert an svbool type to an SVE predicate type";
+  let description = [{
+    Converts svbool types (`vector<[16]xi1>` or vectors of that type, e.g.
+    `vector<2x3x[16]xi1>`) to SVE predicate types. Note: Only the trailing
+    dimension can be scalable.
+
+    Example 1: Convert a 1-D svbool mask to an SVE predicate.
+    ```mlir
+    %svbool = vector.load %memref[%c0] : memref<?xi1>, vector<[16]xi1>
+    %mask = arm_sve.convert_from_svbool %svbool : vector<[4]xi1>
+    ```
+
+    Example 2: Convert a 2-D svbool mask to a mask of SVE predicates.
+    ```mlir
+    %svbool = vector.load %memref[%c0, %c0] : memref<2x?xi1>, vector<2x[16]xi1>
+    %mask = arm_sve.convert_from_svbool %svbool : vector<2x[8]xi1>
+    ```
+  }];
+  let arguments = (ins SVBoolMask:$source);
+  let results = (outs SVEMask:$result);
+  let assemblyFormat = "$source attr-dict `:` type($result)";
+}
+
+def ConvertToSvboolOp : ArmSVE_Op<"convert_to_svbool",
+                            [Pure, SvboolTypeConstraint<"source", "result">]>
+{
+  let summary = "Convert a predicate type to an svbool type";
+  let description = [{
+    Converts SVE predicate types (or vectors of predicate types, e.g.
+    `vector<4x[4]xi1>`) to svbool types. Note: Only the trailing dimension can
+    be scalable.
+
+    Example 1: Convert a 1-D SVE predicate to an svbool mask.
+    ```mlir
+    %mask = vector.create_mask %dim_size : vector<[4]xi1>
+    %svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
+    // => Results in vector<[16]xi1>
+    ```
+
+    Example 2: Convert a 2-D mask of SVE predicates to an svbool mask.
+    ```mlir
+    %mask = vector.create_mask %c2, %dim_size : vector<2x[2]xi1>
+    %svbool = arm_sve.convert_to_svbool %mask : vector<2x[2]xi1>
+    // => Results in vector<2x[16]xi1>
+    ```
+  }];
+  let arguments = (ins SVEMask:$source);
+  let results = (outs SVBoolMask:$result);
+  let assemblyFormat = "$source attr-dict `:` type($source)";
+}
+
 def ScalableMaskedAddIOp : ScalableMaskedIOp<"masked.addi", "addition",
                                              [Commutative]>;
 
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index 4fc14e30b8a10d0..54a5a97fe2b6425 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -37,6 +37,12 @@ def IsFixedVectorTypePred : CPred<[{::llvm::isa<::mlir::VectorType>($_self) &&
 def IsScalableVectorTypePred : CPred<[{::llvm::isa<::mlir::VectorType>($_self) &&
                                    ::llvm::cast<VectorType>($_self).isScalable()}]>;
 
+// Whether a type is a scalable VectorType, with a single trailing scalable dimension.
+def IsTrailingScalableVectorTypePred : And<[CPred<"::llvm::isa<::mlir::VectorType>($_self)">,
+                            CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">,
+                            CPred<"::llvm::cast<::mlir::VectorType>($_self).getScalableDims().back()">,
+                            CPred<"!llvm::is_contained(::llvm::cast<::mlir::VectorType>($_self).getScalableDims().drop_back(), true)">]>;
+
 // Whether a type is a VectorType and all dimensions are scalable.
 def allDimsScalableVectorTypePred : And<[
   IsVectorTypePred,
@@ -404,6 +410,10 @@ class ScalableVectorOf<list<Type> allowedTypes> :
   ShapedContainerType<allowedTypes, IsScalableVectorTypePred,
           "scalable vector", "::mlir::VectorType">;
 
+class TrailingScalableVectorOf<list<Type> allowedTypes> :
+  ShapedContainerType<allowedTypes, IsTrailingScalableVectorTypePred,
+          "trailing scalable vector", "::mlir::VectorType">;
+
 // Whether the number of elements of a vector is from the given
 // `allowedRanks` list
 class IsVectorOfRankPred<list<int> allowedRanks> :
@@ -481,10 +491,32 @@ class IsScalableVectorOfLengthPred<list<int> allowedLengths> :
                            == }]
                          # allowedlength>)>]>;
 
+class abs<int value> {
+  int ret = !if(!lt(value, 0), !sub(0, value), value);
+}
+
+// Whether the n-th (starting from 1) dim of the shape is one of `allowedSizes`.
+// Negative values index in reverse.
+class IsNthDimSizeIsOneOfPred<int n, list<int> allowedSizes>
+  : And<[CPred<"::llvm::cast<::mlir::ShapedType>($_self).getRank() >= " # abs<n>.ret>,
+        CPred<"::llvm::is_contained(ArrayRef<int64_t>({" # !interleave(allowedSizes, ", ") # "}), "
+          # "::llvm::cast<::mlir::ShapedType>($_self).getDimSize("
+          #   !if(!lt(n, 0),
+                "::llvm::cast<::mlir::ShapedType>($_self).getRank() + " # n,
+                "" # !sub(n, 1))
+          # "))">]>;
+
 // Whether the shape of a vector matches the given `shape` list.
 class IsVectorOfShape<list<int> shape>
   : CPred<"::llvm::cast<::mlir::VectorType>($_self).getShape() == ArrayRef<int64_t>({" # !interleave(shape, ", ") # "})">;
 
+// Any ShapedType where the size of the n-th dim is contained in `sizes`.
+// Negative values index in reverse.
+class ShapedTypeWithNthDimOfSize<int n, list<int> allowedSizes> : Type<
+  IsNthDimSizeIsOneOfPred<n, allowedSizes>,
+  " with dim " # n # " having a size of {" # !interleave(allowedSizes, ", ") # "}",
+  "::mlir::ShapedType">;
+
 // Any vector where the number of elements is from the given
 // `allowedLengths` list
 class VectorOfLength<list<int> allowedLengths> : Type<
@@ -546,6 +578,17 @@ class ScalableVectorOfRankAndLengthAndType<list<int> allowedRanks,
   ScalableVectorOfLength<allowedLengths>.summary,
   "::mlir::VectorType">;
 
+// Any scalable vector with a single trailing scalable dimension, where the
+// size of the trailing dimension is in `allowedTrailingSizes` list, and the
+// type is in the `allowedTypes` list.
+class TrailingScalableVectorOfSizeAndType<list<int> allowedTrailingSizes,
+                                           list<Type> allowedTypes> : AllOfType<
+  [TrailingScalableVectorOf<allowedTypes>,
+   ShapedTypeWithNthDimOfSize<-1, allowedTrailingSizes>],
+   TrailingScalableVectorOf<allowedTypes>.summary #
+   ShapedTypeWithNthDimOfSize<-1, allowedTrailingSizes>.summary,
+  "::mlir::VectorType">;
+
 def AnyVector : VectorOf<[AnyType]>;
 // Temporary vector type clone that allows gradual transition to 0-D vectors.
 def AnyVectorOfAnyRank : VectorOfAnyRankOf<[AnyType]>;
diff --git a/mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp b/mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp
index b7f1020deba1e40..594c9b4c270f218 100644
--- a/mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp
+++ b/mlir/lib/Dialect/ArmSVE/IR/ArmSVEDialect.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
diff --git a/mlir/lib/Dialect/ArmSVE/IR/CMakeLists.txt b/mlir/lib/Dialect/ArmSVE/IR/CMakeLists.txt
index fffc77245d12c93..9ef7384fc54925a 100644
--- a/mlir/lib/Dialect/ArmSVE/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSVE/IR/CMakeLists.txt
@@ -10,5 +10,6 @@ add_mlir_dialect_library(MLIRArmSVEDialect
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRLLVMDialect
+  MLIRVectorDialect
   MLIRSideEffectInterfaces
   )
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
index 7031ab4f799c4d2..2f1c43fae240d51 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRArmSVETransforms
   LINK_LIBS PUBLIC
   MLIRArmSVEDialect
   MLIRFuncDialect
+  MLIRVectorDialect
   MLIRIR
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
index abbb978304068e2..d280d2415ecdbfd 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
@@ -12,6 +12,8 @@
 #include "mlir/Dialect/ArmSVE/Transforms/Transforms.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/PatternMatch.h"
 
@@ -66,6 +68,54 @@ using ScalableMaskedDivFOpLowering =
     OneToOneConvertToLLVMPattern<ScalableMaskedDivFOp,
                                  ScalableMaskedDivFIntrOp>;
 
+namespace {
+
+template <typename Op, typename IntrOp>
+struct SvboolConversionOpLowering : public ConvertOpToLLVMPattern<Op> {
+  using ConvertOpToLLVMPattern<Op>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(Op convertOp, typename Op::Adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = convertOp.getLoc();
+
+    auto source = convertOp.getSource();
+    VectorType sourceType = source.getType();
+    VectorType resultType = convertOp.getResult().getType();
+
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, resultType, rewriter.getZeroAttr(resultType));
+
+    SmallVector<int64_t> tileShape(sourceType.getRank(), 1);
+    tileShape.back() = sourceType.getShape().back();
+
+    for (SmallVector<int64_t> index :
+         StaticTileOffsetRange(sourceType.getShape(), tileShape)) {
+      auto extractOrInsertPosition = ArrayRef(index).drop_back();
+      auto sourceVector = rewriter.create<vector::ExtractOp>(
+          loc, source, extractOrInsertPosition);
+      auto convertedType =
+          VectorType::Builder(llvm::cast<VectorType>(sourceVector.getType()))
+              .setDim(0, resultType.getShape().back());
+      auto convertedVector =
+          rewriter.create<IntrOp>(loc, TypeRange{convertedType}, sourceVector);
+      result = rewriter.create<vector::InsertOp>(loc, convertedVector, result,
+                                                 extractOrInsertPosition);
+    }
+
+    rewriter.replaceOp(convertOp, result);
+    return success();
+  }
+};
+
+using ConvertToSvboolOpLowering =
+    SvboolConversionOpLowering<ConvertToSvboolOp, ConvertToSvboolIntrOp>;
+
+using ConvertFromSvboolOpLowering =
+    SvboolConversionOpLowering<ConvertFromSvboolOp, ConvertFromSvboolIntrOp>;
+
+} // namespace
+
 /// Populate the given list with patterns that convert from ArmSVE to LLVM.
 void mlir::populateArmSVELegalizeForLLVMExportPatterns(
     LLVMTypeConverter &converter, RewritePatternSet &patterns) {
@@ -88,7 +138,9 @@ void mlir::populateArmSVELegalizeForLLVMExportPatterns(
                ScalableMaskedMulFOpLowering,
                ScalableMaskedSDivIOpLowering,
                ScalableMaskedUDivIOpLowering,
-               ScalableMaskedDivFOpLowering>(converter);
+               ScalableMaskedDivFOpLowering,
+               ConvertToSvboolOpLowering,
+               ConvertFromSvboolOpLowering>(converter);
   // clang-format on
 }
 
@@ -107,7 +159,9 @@ void mlir::configureArmSVELegalizeForExportTarget(
                     ScalableMaskedMulFIntrOp,
                     ScalableMaskedSDivIIntrOp,
                     ScalableMaskedUDivIIntrOp,
-                    ScalableMaskedDivFIntrOp>();
+                    ScalableMaskedDivFIntrOp,
+                    ConvertToSvboolIntrOp,
+                    ConvertFromSvboolIntrOp>();
   target.addIllegalOp<SdotOp,
                       SmmlaOp,
                       UdotOp,
@@ -120,6 +174,8 @@ void mlir::configureArmSVELegalizeForExportTarget(
                       ScalableMaskedMulFOp,
                       ScalableMaskedSDivIOp,
                       ScalableMaskedUDivIOp,
-                      ScalableMaskedDivFOp>();
+                      ScalableMaskedDivFOp,
+                      ConvertToSvboolOp,
+                      ConvertFromSvboolOp>();
   // clang-format on
 }
diff --git a/mlir/test/Dialect/ArmSVE/invalid.mlir b/mlir/test/Dialect/ArmSVE/invalid.mlir
new file mode 100644
index 000000000000000..a1fa0d0292b7b76
--- /dev/null
+++ b/mlir/test/Dialect/ArmSVE/invalid.mlir
@@ -0,0 +1,51 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics
+
+// -----
+
+func.func @arm_sve_convert_from_svbool__bad_mask_type(%bool: vector<2x[16]xi1>) -> vector<2x[8]xi2> {
+  // expected-error@+1 {{'result' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<2x[8]xi2>'}}
+  %mask = arm_sve.convert_from_svbool %bool : vector<2x[8]xi2>
+  return %mask : vector<2x[8]xi2>
+}
+
+// -----
+
+func.func @arm_sve_convert_from_svbool__bad_mask_shape(%bool : vector<[16]xi1>) -> vector<[7]xi1> {
+  // expected-error@+1 {{'result' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<[7]xi1>'}}
+  %mask = arm_sve.convert_from_svbool %bool : vector<[7]xi1>
+  return %mask : vector<[7]xi1>
+}
+
+// -----
+
+func.func @arm_sve_convert_from_svbool__bad_mask_scalability(%bool : vector<[4]x[16]xi1>) -> vector<[4]x[8]xi1> {
+  // expected-error@+1 {{'result' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<[4]x[8]xi1>'}}
+  %mask = arm_sve.convert_from_svbool %bool : vector<[4]x[8]xi1>
+  return %mask : vector<[4]x[8]xi1>
+}
+
+// -----
+
+func.func @arm_sve_convert_to_svbool__bad_mask_type(%mask: vector<2x[8]xi2>) -> vector<2x[16]xi1> {
+  // expected-error@+1 {{'source' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<2x[8]xi2>'}}
+  %bool = arm_sve.convert_to_svbool %mask : vector<2x[8]xi2>
+  return %bool : vector<2x[16]xi1>
+}
+
+// -----
+
+func.func @arm_sve_convert_to_svbool__bad_mask_shape(%mask : vector<[7]xi1>) -> vector<[16]xi1> {
+  // expected-error@+1 {{'source' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<[7]xi1>'}}
+  %bool = arm_sve.convert_to_svbool %mask : vector<[7]xi1>
+  return
+}
+
+// -----
+
+func.func @arm_sve_convert_to_svbool__bad_mask_scalability(%mask : vector<[4]x[8]xi1>) -> vector<[4]x[16]xi1> {
+  // expected-error@+1 {{'source' must be trailing scalable vector of 1-bit signless integer values with dim -1 having a size of {16, 8, 4, 2, 1}, but got 'vector<[4]x[8]xi1>'}}
+  %bool = arm_sve.convert_to_svbool %mask : vector<[4]x[8]xi1>
+  return
+}
+
+
diff --git a/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir b/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir
index 2d980db981034dd..04f2f43e6a5e78d 100644
--- a/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir
+++ b/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-vector-to-llvm="enable-arm-sve" -convert-func-to-llvm -reconcile-unrealized-casts | mlir-opt | FileCheck %s
+// RUN: mlir-opt -convert-vector-to-llvm="enable-arm-sve" -convert-func-to-llvm -reconcile-unrealized-casts -canonicalize -split-input-file %s | FileCheck %s
 
 func.func @arm_sve_sdot(%a: vector<[16]xi8>,
                    %b: vector<[16]xi8>,
@@ -10,6 +10,8 @@ func.func @arm_sve_sdot(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_smmla(%a: vector<[16]xi8>,
                     %b: vector<[16]xi8>,
                     %c: vector<[4]xi32>)
@@ -20,6 +22,8 @@ func.func @arm_sve_smmla(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_udot(%a: vector<[16]xi8>,
                    %b: vector<[16]xi8>,
                    %c: vector<[4]xi32>)
@@ -30,6 +34,8 @@ func.func @arm_sve_udot(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_ummla(%a: vector<[16]xi8>,
                     %b: vector<[16]xi8>,
                     %c: vector<[4]xi32>)
@@ -40,6 +46,8 @@ func.func @arm_sve_ummla(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_arithi_masked(%a: vector<[4]xi32>,
                             %b: vector<[4]xi32>,
                             %c: vector<[4]xi32>,
@@ -65,6 +73,8 @@ func.func @arm_sve_arithi_masked(%a: vector<[4]xi32>,
   return %4 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_arithf_masked(%a: vector<[4]xf32>,
                             %b: vector<[4]xf32>,
                             %c: vector<[4]xf32>,
@@ -87,6 +97,8 @@ func.func @arm_sve_arithf_masked(%a: vector<[4]xf32>,
   return %3 : vector<[4]xf32>
 }
 
+// -----
+
 func.func @arm_sve_abs_diff(%a: vector<[4]xi32>,
                        %b: vector<[4]xi32>)
                        -> vector<[4]xi32> {
@@ -111,8 +123,67 @@ func.func @arm_sve_abs_diff(%a: vector<[4]xi32>,
   return %3 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @get_vector_scale() -> index {
   // CHECK: llvm.intr.vscale
   %0 = vector.vscale
   return %0 : index
 }
+
+// -----
+
+func.func @convert_1d_mask_to_svbool(%mask: vector<[4]xi1>) -> vector<[16]xi1>
+{
+  // CHECK: "arm_sve.intr.convert.to.svbool"(%{{.*}}) : (vector<[4]xi1>) -> vector<[16]xi1>
+  %svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
+  return %svbool : vector<[16]xi1>
+}
+
+// -----
+
+func.func @convert_1d_mask_from_svbool(%svbool: vector<[16]xi1>) -> vector<[2]xi1>
+{
+  // CHECK: "arm_sve.intr.convert.from.svbool"(%{{.*}}) : (vector<[16]xi1>) -> vector<[2]xi1>
+  %mask = arm_sve.convert_from_svbool %svbool : vector<[2]xi1>
+  return %mask : vector<[2]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @convert_2d_mask_to_svbool(
+// CHECK-SAME:                             %[[MASK:.*]]: !llvm.array<2 x vector<[8]xi1>>)
+func.func @convert_2d_mask_to_svbool(%mask: vector<2x[8]xi1>) -> vector<2x[16]xi1>
+{
+  // CHECK-NEXT:    %[[RES0:.*]] = llvm.mlir.constant(dense<false> : vector<2x[16]xi1>) : !llvm.array<2 x vector<[16]xi1>>
+  // CHECK-NEXT:   %[[MASK0:.*]] = llvm.extractvalue %[[MASK]][0] : !llvm.array<2 x vector<[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL0:.*]] = "arm_sve.intr.convert.to.svbool"(%[[MASK0]]) : (vector<[8]xi1>) -> vector<[16]xi1>
+  // CHECK-NEXT:    %[[RES1:.*]] = llvm.insertvalue %[[SVBOOL0]], %[[RES0]][0] : !llvm.array<2 x vector<[16]xi1>>
+  // CHECK-NEXT:   %[[MASK1:.*]] = llvm.extractvalue %[[MASK]][1] : !llvm.array<2 x vector<[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL1:.*]] = "arm_sve.intr.convert.to.svbool"(%[[MASK1]]) : (vector<[8]xi1>) -> vector<[16]xi1>
+  // CHECK-NEXT:  %[[SVBOOL:.*]] = llvm.insertvalue %[[SVBOOL1]], %[[RES1]][1] : !llvm.array<2 x vector<[16]xi1>>
+  %svbool = arm_sve.convert_to_svbool %mask : vector<2x[8]xi1>
+  // CHECK-NEXT: llvm.return %[[SVBOOL]] : !llvm.array<2 x vector<[16]xi1>>
+  return %svbool : vector<2x[16]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @convert_2d_mask_from_svbool(
+// CHECK-SAME:                               %[[SVBOOL:.*]]: !llvm.array<3 x vector<[16]xi1>>)
+func.func @convert_2d_mask_from_svbool(%svbool: vector<3x[16]xi1>) -> vector<3x[1]xi1>
+{
+  // CHECK-NEXT:    %[[RES0:.*]] = llvm.mlir.constant(dense<false> : vector<3x[1]xi1>) : !llvm.array<3 x vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL0:.*]] = llvm.extractvalue %[[SVBOOL]][0] : !llvm.array<3 x vector<[16]xi1>>
+  // CHECK-NEXT:   %[[MASK0:.*]] = "arm_sve.intr.convert.from.svbool"(%[[SVBOOL0]]) : (vector<[16]xi1>) -> vector<[1]xi1>
+  // CHECK-NEXT:    %[[RES1:.*]] = llvm.insertvalue %[[MASK0]], %[[RES0]][0] : !llvm.array<3 x vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL1:.*]] = llvm.extractvalue %[[SVBOOL]][1] : !llvm.array<3 x vector<[16]xi1>>
+  // CHECK-NEXT:   %[[MASK1:.*]] = "arm_sve.intr.convert.from.svbool"(%[[SVBOOL1]]) : (vector<[16]xi1>) -> vector<[1]xi1>
+  // CHECK-NEXT:    %[[RES2:.*]] = llvm.insertvalue %[[MASK1]], %[[RES1]][1] : !llvm.array<3 x vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL2:.*]] = llvm.extractvalue %[[SVBOOL]][2] : !llvm.array<3 x vector<[16]xi1>>
+  // CHECK-NEXT:   %[[MASK2:.*]] = "arm_sve.intr.convert.from.svbool"(%[[SVBOOL2]]) : (vector<[16]xi1>) -> vector<[1]xi1>
+  // CHECK-NEXT:    %[[MASK:.*]] = llvm.insertvalue %[[MASK2]], %[[RES2]][2] : !llvm.array<3 x vector<[1]xi1>>
+  %mask = arm_sve.convert_from_svbool %svbool : vector<3x[1]xi1>
+  // CHECK-NEXT: llvm.return %[[MASK]] : !llvm.array<3 x vector<[1]xi1>>
+  return %mask : vector<3x[1]xi1>
+}
diff --git a/mlir/test/Dialect/ArmSVE/roundtrip.mlir b/mlir/test/Dialect/ArmSVE/roundtrip.mlir
index d2ca035c17bfbcf..af390bb330a341d 100644
--- a/mlir/test/Dialect/ArmSVE/roundtrip.mlir
+++ b/mlir/test/Dialect/ArmSVE/roundtrip.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -verify-diagnostics %s | mlir-opt | FileCheck %s
+// RUN: mlir-opt -verify-diagnostics -split-input-file %s | mlir-opt | FileCheck %s
 
 func.func @arm_sve_sdot(%a: vector<[16]xi8>,
                    %b: vector<[16]xi8>,
@@ -9,6 +9,8 @@ func.func @arm_sve_sdot(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_smmla(%a: vector<[16]xi8>,
                     %b: vector<[16]xi8>,
                     %c: vector<[4]xi32>) -> vector<[4]xi32> {
@@ -18,6 +20,8 @@ func.func @arm_sve_smmla(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_udot(%a: vector<[16]xi8>,
                    %b: vector<[16]xi8>,
                    %c: vector<[4]xi32>) -> vector<[4]xi32> {
@@ -27,6 +31,8 @@ func.func @arm_sve_udot(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_ummla(%a: vector<[16]xi8>,
                     %b: vector<[16]xi8>,
                     %c: vector<[4]xi32>) -> vector<[4]xi32> {
@@ -36,6 +42,8 @@ func.func @arm_sve_ummla(%a: vector<[16]xi8>,
   return %0 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_masked_arithi(%a: vector<[4]xi32>,
                             %b: vector<[4]xi32>,
                             %c: vector<[4]xi32>,
@@ -61,6 +69,8 @@ func.func @arm_sve_masked_arithi(%a: vector<[4]xi32>,
   return %2 : vector<[4]xi32>
 }
 
+// -----
+
 func.func @arm_sve_masked_arithf(%a: vector<[4]xf32>,
                             %b: vector<[4]xf32>,
                             %c: vector<[4]xf32>,
@@ -82,3 +92,40 @@ func.func @arm_sve_masked_arithf(%a: vector<[4]xf32>,
                                            vector<[4]xf32>
   return %3 : vector<[4]xf32>
 }
+
+// -----
+
+func.func @arm_sve_convert_to_svbool(%a: vector<[1]xi1>,
+                                     %b: vector<[2]xi1>,
+                                     %c: vector<[4]xi1>,
+                                     %d: vector<[8]xi1>) {
+  // CHECK: arm_sve.convert_to_svbool %{{.*}} : vector<[1]xi1>
+  %1 = arm_sve.convert_to_svbool %a : vector<[1]xi1>
+
+  // CHECK: arm_sve.convert_to_svbool %{{.*}} : vector<[2]xi1>
+  %2 = arm_sve.convert_to_svbool %b : vector<[2]xi1>
+
+  // CHECK: arm_sve.convert_to_svbool %{{.*}} : vector<[4]xi1>
+  %3 = arm_sve.convert_to_svbool %c : vector<[4]xi1>
+
+  // CHECK: arm_sve.convert_to_svbool %{{.*}} : vector<[8]xi1>
+  %4 = arm_sve.convert_to_svbool %d : vector<[8]xi1>
+  return
+}
+
+// -----
+
+func.func @arm_sve_convert_from_svbool(%bool: vector<[16]xi1>) {
+  // CHECK: arm_sve.convert_from_svbool %{{.*}} : vector<[1]xi1>
+  %1 = arm_sve.convert_from_svbool %bool : vector<[1]xi1>
+
+  // CHECK: arm_sve.convert_from_svbool %{{.*}} : vector<[2]xi1>
+  %2 = arm_sve.convert_from_svbool %bool : vector<[2]xi1>
+
+  // CHECK: arm_sve.convert_from_svbool %{{.*}} : vector<[4]xi1>
+  %3 = arm_sve.convert_from_svbool %bool : vector<[4]xi1>
+
+  // CHECK: arm_sve.convert_from_svbool %{{.*}} : vector<[8]xi1>
+  %4 = arm_sve.convert_from_svbool %bool : vector<[8]xi1>
+  return
+}

>From 40ff7db61a1938cdcd22d247cec19d4696d68618 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 10 Oct 2023 18:53:05 +0000
Subject: [PATCH 4/4] [mlir][ArmSVE] Add `-arm-sve-legalize-vector-storage`
 pass

This patch adds a pass that ensures that loads, stores, and allocations
of SVE vector types will be legal in the LLVM backend. It does this at
the memref level, so this pass must be applied before lowering all the
way to LLVM.

This pass currently fixes two issues.

It is only legal to load/store predicate types equal to (or greater
than) a full predicate register, which in MLIR is `vector<[16]xi1>`.
Smaller predicate types (`vector<[1|2|4|8]xi1>`) must be converted
to/from a full predicate type (referred to as an `svbool`) before
storing and after loading, respectively. This pass does this by
widening allocations and inserting conversion intrinsics.

For example:

```mlir
%alloca = memref.alloca() : memref<vector<[4]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
%reload = memref.load %alloca[] : memref<vector<[4]xi1>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
%svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
%reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
%reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
```

## Relax alignments for SVE vector allocas

The storage for SVE vector types only needs to have an alignment that
matches the element type (for example 4 byte alignment for `f32`s).
However, the LLVM backend currently defaults to aligning to `base size`
x `element size` bytes. For non-legal vector types like
`vector<[8]xf32>` this results in 8 x 4 = 32-byte alignment, but the
backend only supports up to 16-byte alignment for SVE vectors on the
stack. Explicitly setting a smaller alignment prevents this issue.
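
For example (an illustrative before/after; the 4-byte alignment follows
from the `f32` element size):

```mlir
%alloca = memref.alloca() : memref<vector<[8]xf32>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 4 : i64} : memref<vector<[8]xf32>>
```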
---
 .../mlir/Dialect/ArmSVE/CMakeLists.txt        |   1 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   5 +
 .../mlir/Dialect/ArmSVE/Transforms/Passes.h   |  33 ++
 .../mlir/Dialect/ArmSVE/Transforms/Passes.td  |  67 ++++
 mlir/include/mlir/InitAllPasses.h             |   2 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   2 +
 .../Transforms/LegalizeVectorStorage.cpp      | 310 ++++++++++++++++++
 .../ArmSVE/legalize-vector-storage.mlir       | 160 +++++++++
 .../ArmSVE/arrays-of-scalable-vectors.mlir    | 121 +++++++
 9 files changed, 701 insertions(+)
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
 create mode 100644 mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
 create mode 100644 mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir

diff --git a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
index f33061b2d87cffc..9f57627c321fb0c 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
new file mode 100644
index 000000000000000..7226642daf86172
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name ArmSVE)
+add_public_tablegen_target(MLIRArmSVEPassIncGen)
+
+add_mlir_doc(Passes ArmSVEPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
new file mode 100644
index 000000000000000..317fb9021b3c577
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::arm_sve {
+
+#define GEN_PASS_DECL
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+/// Pass to legalize the storage (loads, stores, and allocations) of SVE
+/// vector types.
+std::unique_ptr<Pass> createLegalizeVectorStoragePass();
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+} // namespace mlir::arm_sve
+
+#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
new file mode 100644
index 000000000000000..35c49607181da0c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
@@ -0,0 +1,67 @@
+//===-- Passes.td - ArmSVE pass definition file ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+def LegalizeVectorStorage
+    : Pass<"arm-sve-legalize-vector-storage", "mlir::func::FuncOp"> {
+  let summary = "Ensures loads, stores, and allocations of SVE vector types will be legal";
+  let description = [{
+    This pass ensures that loads, stores, and allocations of SVE vector types
+    will be legal in the LLVM backend. It does this at the memref level, so this
+    pass must be applied before lowering all the way to LLVM.
+
+    This pass currently fixes two issues.
+
+    ## Loading and storing predicate types
+
+    It is only legal to load/store predicate types equal to (or greater than) a
+    full predicate register, which in MLIR is `vector<[16]xi1>`. Smaller
+    predicate types (`vector<[1|2|4|8]xi1>`) must be converted to/from a full
+    predicate type (referred to as an `svbool`) before storing and after
+    loading, respectively. This pass does this by widening allocations and
+    inserting conversion intrinsics.
+
+    For example:
+
+    ```mlir
+    %alloca = memref.alloca() : memref<vector<[4]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+    %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+    ```
+    Becomes:
+    ```mlir
+    %alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    %svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
+    memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
+    %reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
+    %reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
+    ```
+
+    ## Relax alignments for SVE vector allocas
+
+    The storage for SVE vector types only needs to have an alignment that
+    matches the element type (for example 4 byte alignment for `f32`s). However,
+    the LLVM backend currently defaults to aligning to `base size` x
+    `element size` bytes. For non-legal vector types like `vector<[8]xf32>` this
+    results in 8 x 4 = 32-byte alignment, but the backend only supports up to
+    16-byte alignment for SVE vectors on the stack. Explicitly setting a smaller
+    alignment prevents this issue.
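+
+    For example (an illustrative before/after; the 4-byte alignment follows
+    from the `f32` element size):
+
+    ```mlir
+    %alloca = memref.alloca() : memref<vector<[8]xf32>>
+    ```
+    Becomes:
+    ```mlir
+    %alloca = memref.alloca() {alignment = 4 : i64} : memref<vector<[8]xf32>>
+    ```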
+  }];
+  let constructor = "mlir::arm_sve::createLegalizeVectorStoragePass()";
+  let dependentDialects = ["func::FuncDialect",
+    "memref::MemRefDialect", "vector::VectorDialect",
+    "arm_sve::ArmSVEDialect"];
+}
+
+#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 5489a13a8040bdb..7301905954f56d8 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h"
 #include "mlir/Dialect/Async/Passes.h"
 #include "mlir/Dialect/Bufferization/Pipelines/Passes.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
@@ -82,6 +83,7 @@ inline void registerAllPasses() {
   transform::registerTransformPasses();
   vector::registerVectorPasses();
   arm_sme::registerArmSMEPasses();
+  arm_sve::registerArmSVEPasses();
 
   // Dialect pipelines
   bufferization::registerBufferizationPipelines();
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
index 2f1c43fae240d51..a70c489a51fea9a 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -1,8 +1,10 @@
 add_mlir_dialect_library(MLIRArmSVETransforms
   LegalizeForLLVMExport.cpp
+  LegalizeVectorStorage.cpp
 
   DEPENDS
   MLIRArmSVEConversionsIncGen
+  MLIRArmSVEPassIncGen
 
   LINK_LIBS PUBLIC
   MLIRArmSVEDialect
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
new file mode 100644
index 000000000000000..610eb38089c4c88
--- /dev/null
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
@@ -0,0 +1,310 @@
+//===- LegalizeVectorStorage.cpp - Ensures SVE loads/stores are legal -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h"
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::arm_sve {
+#define GEN_PASS_DEF_LEGALIZEVECTORSTORAGE
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+} // namespace mlir::arm_sve
+
+using namespace mlir;
+using namespace mlir::arm_sve;
+
+constexpr StringLiteral kPassLabel("__arm_sve_legalize_vector_storage__");
+
+namespace {
+
+/// Returns true if `type` is a (legal) SVE predicate mask with a logical size,
+/// i.e. the number of bits matches the number of lanes it masks (such as
+/// vector<[4]xi1>), but is too small to be stored to memory.
+bool isLogicalSVEPredicateType(VectorType type) {
+  return type.getRank() > 0 && type.getElementType().isInteger(1) &&
+         type.getScalableDims().back() && type.getShape().back() < 16 &&
+         llvm::isPowerOf2_32(type.getShape().back()) &&
+         !llvm::is_contained(type.getScalableDims().drop_back(), true);
+}
+
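+/// Widens the trailing scalable dimension of a predicate type to a full
+/// svbool mask, e.g. vector<[4]xi1> -> vector<[16]xi1>.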
+VectorType widenScalableMaskTypeToSvbool(VectorType type) {
+  assert(isLogicalSVEPredicateType(type));
+  return VectorType::Builder(type).setDim(type.getRank() - 1, 16);
+}
+
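+/// Replaces `op` with the value returned by `callback`, which is given a
+/// clone of `op` (inserted via the rewriter) that it may modify in place.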
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithLegalizedOp(PatternRewriter &rewriter, TOp op,
+                              TLegalizerCallback callback) {
+  // Clone the previous op to preserve any properties/attributes.
+  auto newOp = op.clone();
+  rewriter.insert(newOp);
+  rewriter.replaceOp(op, callback(newOp));
+}
+
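+/// Like `replaceOpWithLegalizedOp`, but wraps the value returned by `callback`
+/// in an `unrealized_conversion_cast` back to the original result type, tagged
+/// with the pass label so later patterns can find it.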
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithUnrealizedConversion(PatternRewriter &rewriter, TOp op,
+                                       TLegalizerCallback callback) {
+  replaceOpWithLegalizedOp(rewriter, op, [&](TOp newOp) {
+    // Mark our `unrealized_conversion_casts` with a pass label.
+    return rewriter.create<UnrealizedConversionCastOp>(
+        op.getLoc(), TypeRange{op.getResult().getType()},
+        ValueRange{callback(newOp)},
+        NamedAttribute(rewriter.getStringAttr(kPassLabel),
+                       rewriter.getUnitAttr()));
+  });
+}
+
+/// Extracts the legal memref value from an `unrealized_conversion_cast` added
+/// by this pass.
+static FailureOr<Value> getLegalMemRef(Value illegalMemref) {
+  Operation *definingOp = illegalMemref.getDefiningOp();
+  if (!definingOp || !definingOp->hasAttr(kPassLabel))
+    return failure();
+  auto unrealizedConversion =
+      llvm::cast<UnrealizedConversionCastOp>(definingOp);
+  return unrealizedConversion.getOperand(0);
+}
+
+/// The default alignment of an alloca may request a larger alignment than the
+/// backend supports for SVE types, which will fail during stack frame
+/// allocation. This rewrite explicitly adds a reasonable alignment to allocas
+/// of scalable types.
+struct RelaxScalableVectorAllocaAlignment
+    : public OpRewritePattern<memref::AllocaOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::AllocaOp allocaOp,
+                                PatternRewriter &rewriter) const override {
+    auto elementType = allocaOp.getType().getElementType();
+    auto vectorType = llvm::dyn_cast<VectorType>(elementType);
+    if (!vectorType || !vectorType.isScalable() || allocaOp.getAlignment())
+      return failure();
+
+    unsigned elementByteSize =
+        vectorType.getElementType().getIntOrFloatBitWidth() / 8;
+
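+    // Use a minimum of 1-byte alignment; sub-byte element widths (e.g. i1)
+    // would otherwise produce an alignment of 0.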
+    unsigned alignment = std::max(1u, elementByteSize);
+    allocaOp.setAlignment(alignment);
+
+    return success();
+  }
+};
+
+/// Replaces allocations of SVE predicates smaller than an svbool with a wider
+/// allocation and a tagged unrealized conversion.
+///
+/// Example
+/// ```
+/// %alloca = memref.alloca() : memref<vector<[4]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///   : memref<vector<[16]xi1>> to memref<vector<[4]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// ```
+template <typename AllocLikeOp>
+struct LegalizeAllocLikeOpConversion : public OpRewritePattern<AllocLikeOp> {
+  using OpRewritePattern<AllocLikeOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(AllocLikeOp allocLikeOp,
+                                PatternRewriter &rewriter) const override {
+    auto vectorType =
+        llvm::dyn_cast<VectorType>(allocLikeOp.getType().getElementType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    // Replace this alloc-like op of an SVE mask with one of a (storable)
+    // svbool_t mask. A temporary unrealized_conversion_cast back to the old
+    // type is added to allow local rewrites.
+    replaceOpWithUnrealizedConversion(
+        rewriter, allocLikeOp, [&](AllocLikeOp newAllocLikeOp) {
+          newAllocLikeOp.getResult().setType(
+              llvm::cast<MemRefType>(newAllocLikeOp.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newAllocLikeOp;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces vector.type_casts of unrealized conversions to illegal memref types
+/// with legal type casts, followed by unrealized conversions.
+///
+/// Example:
+/// ```
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///   : memref<vector<3x[16]xi1>> to memref<vector<3x[8]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// %cast = vector.type_cast %alloca
+///   : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened_cast = vector.type_cast %widened
+///   : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+/// %cast = builtin.unrealized_conversion_cast %widened_cast
+///   : memref<3xvector<[16]xi1>> to memref<3xvector<[8]xi1>>
+///     {__arm_sve_legalize_vector_storage__}
+/// ```
+struct LegalizeVectorTypeCastConversion
+    : public OpRewritePattern<vector::TypeCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TypeCastOp typeCastOp,
+                                PatternRewriter &rewriter) const override {
+    auto resultType = typeCastOp.getResultMemRefType();
+    auto vectorType = llvm::dyn_cast<VectorType>(resultType.getElementType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(typeCastOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    // Replace this vector.type_cast with one of a (storable) svbool_t mask.
+    replaceOpWithUnrealizedConversion(
+        rewriter, typeCastOp, [&](vector::TypeCastOp newTypeCast) {
+          newTypeCast.setOperand(*legalMemref);
+          newTypeCast.getResult().setType(
+              llvm::cast<MemRefType>(newTypeCast.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newTypeCast;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces stores to unrealized conversions to illegal memref types with
+/// `arm_sve.convert_to_svbool`s followed by (legal) wider stores.
+///
+/// Example:
+/// ```
+/// memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = arm_sve.convert_to_svbool %mask : vector<[8]xi1>
+/// memref.store %svbool, %widened[] : memref<vector<[16]xi1>>
+/// ```
+struct LegalizeMemrefStoreConversion
+    : public OpRewritePattern<memref::StoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::StoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = storeOp.getLoc();
+
+    Value valueToStore = storeOp.getValueToStore();
+    auto vectorType = llvm::dyn_cast<VectorType>(valueToStore.getType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(storeOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(
+        llvm::cast<VectorType>(valueToStore.getType()));
+    auto convertToSvbool = rewriter.create<arm_sve::ConvertToSvboolOp>(
+        loc, legalMaskType, valueToStore);
+    // Replace this store with a conversion to a storable svbool_t mask,
+    // followed by a wider store.
+    replaceOpWithLegalizedOp(rewriter, storeOp,
+                             [&](memref::StoreOp newStoreOp) {
+                               newStoreOp.setOperand(0, convertToSvbool);
+                               newStoreOp.setOperand(1, *legalMemref);
+                               return newStoreOp;
+                             });
+
+    return success();
+  }
+};
+
+/// Replaces loads from unrealized conversions to illegal memref types with
+/// (legal) wider loads, followed by `arm_sve.convert_from_svbool`s.
+///
+/// Example:
+/// ```
+/// %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = memref.load %widened[] : memref<vector<[16]xi1>>
+/// %reload = arm_sve.convert_from_svbool %svbool : vector<[4]xi1>
+/// ```
+struct LegalizeMemrefLoadConversion : public OpRewritePattern<memref::LoadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::LoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = loadOp.getLoc();
+
+    Value loadedMask = loadOp.getResult();
+    auto vectorType = llvm::dyn_cast<VectorType>(loadedMask.getType());
+
+    if (!vectorType || !isLogicalSVEPredicateType(vectorType))
+      return failure();
+
+    auto legalMemref = getLegalMemRef(loadOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(vectorType);
+    // Replace this load with a legal load of an svbool_t type, followed by a
+    // conversion back to the original type.
+    replaceOpWithLegalizedOp(rewriter, loadOp, [&](memref::LoadOp newLoadOp) {
+      newLoadOp.setMemRef(*legalMemref);
+      newLoadOp.getResult().setType(legalMaskType);
+      return rewriter.create<arm_sve::ConvertFromSvboolOp>(
+          loc, loadedMask.getType(), newLoadOp);
+    });
+
+    return success();
+  }
+};
+
+struct LegalizeVectorStorage
+    : public arm_sve::impl::LegalizeVectorStorageBase<LegalizeVectorStorage> {
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<RelaxScalableVectorAllocaAlignment,
+                 LegalizeAllocLikeOpConversion<memref::AllocaOp>,
+                 LegalizeAllocLikeOpConversion<memref::AllocOp>,
+                 LegalizeVectorTypeCastConversion,
+                 LegalizeMemrefStoreConversion, LegalizeMemrefLoadConversion>(
+        patterns.getContext());
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
+                                            std::move(patterns)))) {
+      signalPassFailure();
+    }
+    ConversionTarget target(getContext());
+    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+        [](UnrealizedConversionCastOp unrealizedConversion) {
+          return !unrealizedConversion->hasAttr(kPassLabel);
+        });
+    // This detects if we failed to completely legalize the IR.
+    if (failed(applyPartialConversion(getOperation(), target, {})))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::arm_sve::createLegalizeVectorStoragePass() {
+  return std::make_unique<LegalizeVectorStorage>();
+}
diff --git a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
new file mode 100644
index 000000000000000..fda8e2e0fab9618
--- /dev/null
+++ b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
@@ -0,0 +1,160 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -arm-sve-legalize-vector-storage -split-input-file -verify-diagnostics | FileCheck %s
+
+/// This tests the basic functionality of the -arm-sve-legalize-vector-storage pass.
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv1i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[1]xi1>)
+func.func @store_and_reload_sve_predicate_nxv1i1(%mask: vector<[1]xi1>) -> vector<[1]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[1]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[1]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[1]xi1>
+  return %reload : vector<[1]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv2i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[2]xi1>)
+func.func @store_and_reload_sve_predicate_nxv2i1(%mask: vector<[2]xi1>) -> vector<[2]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[2]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[2]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[2]xi1>
+  return %reload : vector<[2]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv4i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[4]xi1>)
+func.func @store_and_reload_sve_predicate_nxv4i1(%mask: vector<[4]xi1>) -> vector<[4]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[4]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[4]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[4]xi1>
+  return %reload : vector<[4]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv8i1(
+// CHECK-SAME:                                         %[[MASK:.*]]: vector<[8]xi1>)
+func.func @store_and_reload_sve_predicate_nxv8i1(%mask: vector<[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[8]xi1>
+  return %reload : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_2d_mask_and_reload_slice(
+// CHECK-SAME:                                  %[[MASK:.*]]: vector<3x[8]xi1>)
+func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+  %c0 = arith.constant 0 : index
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 1 : i64} : memref<vector<3x[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<3x[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<3x[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[UNPACK:.*]] = vector.type_cast %[[ALLOCA]] : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+  %unpack = vector.type_cast %alloca : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[UNPACK]][%[[C0]]] : memref<3xvector<[16]xi1>>
+  // CHECK-NEXT: %[[SLICE:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %slice = memref.load %unpack[%c0] : memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: return %[[SLICE]] : vector<[8]xi1>
+  return %slice : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @set_sve_alloca_alignment
+func.func @set_sve_alloca_alignment() {
+  // CHECK-COUNT-6: alignment = 1
+  %a1 = memref.alloca() : memref<vector<[32]xi8>>
+  %a2 = memref.alloca() : memref<vector<[16]xi8>>
+  %a3 = memref.alloca() : memref<vector<[8]xi8>>
+  %a4 = memref.alloca() : memref<vector<[4]xi8>>
+  %a5 = memref.alloca() : memref<vector<[2]xi8>>
+  %a6 = memref.alloca() : memref<vector<[1]xi8>>
+
+  // CHECK-COUNT-6: alignment = 2
+  %b1 = memref.alloca() : memref<vector<[32]xi16>>
+  %b2 = memref.alloca() : memref<vector<[16]xi16>>
+  %b3 = memref.alloca() : memref<vector<[8]xi16>>
+  %b4 = memref.alloca() : memref<vector<[4]xi16>>
+  %b5 = memref.alloca() : memref<vector<[2]xi16>>
+  %b6 = memref.alloca() : memref<vector<[1]xi16>>
+
+  // CHECK-COUNT-6: alignment = 4
+  %c1 = memref.alloca() : memref<vector<[32]xi32>>
+  %c2 = memref.alloca() : memref<vector<[16]xi32>>
+  %c3 = memref.alloca() : memref<vector<[8]xi32>>
+  %c4 = memref.alloca() : memref<vector<[4]xi32>>
+  %c5 = memref.alloca() : memref<vector<[2]xi32>>
+  %c6 = memref.alloca() : memref<vector<[1]xi32>>
+
+  // CHECK-COUNT-6: alignment = 8
+  %d1 = memref.alloca() : memref<vector<[32]xi64>>
+  %d2 = memref.alloca() : memref<vector<[16]xi64>>
+  %d3 = memref.alloca() : memref<vector<[8]xi64>>
+  %d4 = memref.alloca() : memref<vector<[4]xi64>>
+  %d5 = memref.alloca() : memref<vector<[2]xi64>>
+  %d6 = memref.alloca() : memref<vector<[1]xi64>>
+
+  // CHECK-COUNT-6: alignment = 4
+  %e1 = memref.alloca() : memref<vector<[32]xf32>>
+  %e2 = memref.alloca() : memref<vector<[16]xf32>>
+  %e3 = memref.alloca() : memref<vector<[8]xf32>>
+  %e4 = memref.alloca() : memref<vector<[4]xf32>>
+  %e5 = memref.alloca() : memref<vector<[2]xf32>>
+  %e6 = memref.alloca() : memref<vector<[1]xf32>>
+
+  // CHECK-COUNT-6: alignment = 8
+  %f1 = memref.alloca() : memref<vector<[32]xf64>>
+  %f2 = memref.alloca() : memref<vector<[16]xf64>>
+  %f3 = memref.alloca() : memref<vector<[8]xf64>>
+  %f4 = memref.alloca() : memref<vector<[4]xf64>>
+  %f5 = memref.alloca() : memref<vector<[2]xf64>>
+  %f6 = memref.alloca() : memref<vector<[1]xf64>>
+
+  "prevent.dce"(
+    %a1, %a2, %a3, %a4, %a5, %a6,
+    %b1, %b2, %b3, %b4, %b5, %b6,
+    %c1, %c2, %c3, %c4, %c5, %c6,
+    %d1, %d2, %d3, %d4, %d5, %d6,
+    %e1, %e2, %e3, %e4, %e5, %e6,
+    %f1, %f2, %f3, %f4, %f5, %f6)
+    : (memref<vector<[32]xi8>>, memref<vector<[16]xi8>>, memref<vector<[8]xi8>>, memref<vector<[4]xi8>>, memref<vector<[2]xi8>>, memref<vector<[1]xi8>>,
+       memref<vector<[32]xi16>>, memref<vector<[16]xi16>>, memref<vector<[8]xi16>>, memref<vector<[4]xi16>>, memref<vector<[2]xi16>>, memref<vector<[1]xi16>>,
+       memref<vector<[32]xi32>>, memref<vector<[16]xi32>>, memref<vector<[8]xi32>>, memref<vector<[4]xi32>>, memref<vector<[2]xi32>>, memref<vector<[1]xi32>>,
+       memref<vector<[32]xi64>>, memref<vector<[16]xi64>>, memref<vector<[8]xi64>>, memref<vector<[4]xi64>>, memref<vector<[2]xi64>>, memref<vector<[1]xi64>>,
+       memref<vector<[32]xf32>>, memref<vector<[16]xf32>>, memref<vector<[8]xf32>>, memref<vector<[4]xf32>>, memref<vector<[2]xf32>>, memref<vector<[1]xf32>>,
+       memref<vector<[32]xf64>>, memref<vector<[16]xf64>>, memref<vector<[8]xf64>>, memref<vector<[4]xf64>>, memref<vector<[2]xf64>>, memref<vector<[1]xf64>>) -> ()
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
new file mode 100644
index 000000000000000..260cd1eabe7dae1
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -0,0 +1,121 @@
+// RUN: mlir-opt %s -convert-vector-to-scf -arm-sve-legalize-vector-storage -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+/// This tests basic functionality of arrays of scalable vectors, which in MLIR
+/// are vectors with a single trailing scalable dimension. This test requires
+/// the -arm-sve-legalize-vector-storage pass to ensure the loads/stores done
+/// here are legal for the LLVM backend.
+
+func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>)  {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %dim = memref.dim %memref, %c1 : memref<3x?xf32>
+  %mask = vector.create_mask %c2, %dim : vector<3x[8]xi1>
+  %vector = vector.transfer_read %memref[%c0,%c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[8]xf32>
+
+  /// TODO: Support vector.print for arrays of scalable vectors.
+  %row0 = vector.extract %vector[0] : vector<[8]xf32> from vector<3x[8]xf32>
+  %row1 = vector.extract %vector[1] : vector<[8]xf32> from vector<3x[8]xf32>
+  %row2 = vector.extract %vector[2] : vector<[8]xf32> from vector<3x[8]xf32>
+
+  /// Print each of the vectors. This only checks the first eight elements (which
+  /// works for all vscale >= 1).
+
+  // CHECK-LABEL: TEST 1
+  vector.print str "TEST 1 (print and read 2D arrays of scalable vectors)"
+  // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
+  vector.print %row0 : vector<[8]xf32>
+  // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
+  vector.print %row1 : vector<[8]xf32>
+  /// This last row is all zero due to our mask.
+  // CHECK: ( 0, 0, 0, 0, 0, 0, 0, 0
+  vector.print %row2 : vector<[8]xf32>
+
+  return
+}
+
+func.func @print_1x2xVSCALExf32(%vector: vector<1x2x[4]xf32>) {
+  /// TODO: Support vector.print for arrays of scalable vectors.
+  %slice0 = vector.extract %vector[0, 0] : vector<[4]xf32> from vector<1x2x[4]xf32>
+  %slice1 = vector.extract %vector[0, 1] : vector<[4]xf32> from vector<1x2x[4]xf32>
+  vector.print %slice0 : vector<[4]xf32>
+  vector.print %slice1 : vector<[4]xf32>
+  return
+}
+
+func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
+  %dim_b = memref.dim %b, %c2 : memref<1x2x?xf32>
+  %mask_a = vector.create_mask %c2, %c2, %dim_a : vector<1x2x[4]xi1>
+  %mask_b = vector.create_mask %c2, %c2, %dim_b : vector<1x2x[4]xi1>
+
+  vector.print str "TEST 2 (reading and adding two 3D arrays of scalable vectors)"
+
+  /// Print each of the vectors. This only checks the first four elements (which
+  /// works for all vscale >= 1).
+
+  // CHECK-LABEL: Vector A
+  // CHECK-NEXT: ( 5, 5, 5, 5
+  // CHECK-NEXT: ( 5, 5, 5, 5
+  vector.print str "\nVector A"
+  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
+  func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> ()
+
+  vector.print str "\nVector B"
+  // CHECK-LABEL: Vector B
+  // CHECK-NEXT: ( 4, 4, 4, 4
+  // CHECK-NEXT: ( 4, 4, 4, 4
+  %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
+  func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> ()
+
+  %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
+  // CHECK-LABEL: Sum
+  // CHECK-NEXT: ( 9, 9, 9, 9
+  // CHECK-NEXT: ( 9, 9, 9, 9
+  vector.print str "\nSum"
+  func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> ()
+
+  return
+}
+
+func.func @entry() {
+  %vscale = vector.vscale
+
+  %c4 = arith.constant 4 : index
+  %c8 = arith.constant 8 : index
+  %f32_8 = arith.constant 8.0 : f32
+  %f32_5 = arith.constant 5.0 : f32
+  %f32_4 = arith.constant 4.0 : f32
+
+  vector.print str "\n====================\n"
+
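+  /// The dynamic dimension is 8 x vscale, so each row of the memref exactly
+  /// holds one vector<[8]xf32>.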
+  %test_1_memref_size = arith.muli %vscale, %c8 : index
+  %test_1_memref = memref.alloca(%test_1_memref_size) : memref<3x?xf32>
+
+  linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>)
+
+  func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> ()
+
+  vector.print str "\n====================\n"
+
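+  /// The dynamic dimension is 4 x vscale, matching the vector<[4]xf32> slices
+  /// read in @add_arrays_of_scalable_vectors.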
+  %test_2_memref_size = arith.muli %vscale, %c4 : index
+  %test_2_memref_a = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32>
+  %test_2_memref_b = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32>
+
+  linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>)
+  linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b :memref<1x2x?xf32>)
+
+  func.call @add_arrays_of_scalable_vectors(
+    %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> ()
+
+  vector.print str "\n====================\n"
+
+  return
+}


