[clang] [llvm] Smef16f16 (PR #88860)

via cfe-commits cfe-commits at lists.llvm.org
Tue Apr 16 02:01:34 PDT 2024


https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/88860

None

>From a4ba2e63ca1dc40422cdf398925cdb4a25f7ca83 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 15 Apr 2024 10:35:03 +0000
Subject: [PATCH 1/3] [CLANG][LLVM][AArch64]SME2.1 intrinsics for MOVAZ tile to
 2/4 vectors

According to the specification in
ARM-software/acle#309 this adds the intrinsics

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
---
 clang/include/clang/Basic/arm_sme.td          |   23 +
 .../acle_sme2p1_movaz.c                       | 1414 +++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |   20 +-
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |   99 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   49 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    3 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |   27 +
 .../AArch64/sme2p1-intrinsics-movaz.ll        |  457 ++++++
 .../CodeGen/AArch64/sme2p1-intrinsics-movaz.s |    0
 9 files changed, 2090 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..44bb32916d86bb 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -674,3 +674,26 @@ let TargetGuard = "sme2" in {
   def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
   def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
 }
+
+multiclass ZAReadz<string n_suffix, string vg_num, string t, string i_prefix, list<ImmCheck> ch> {
+  let TargetGuard = "sme2p1" in {
+    def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t,
+                          MergeNone, i_prefix # "_horiz_x" # vg_num,
+                          [IsStreaming, IsInOutZA], ch>;
+
+    def NAME # _V : SInst<"svreadz_ver_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t,
+                          MergeNone, i_prefix # "_vert_x" #vg_num,
+                          [IsStreaming, IsInOutZA], ch>;
+  }
+}
+
+defm SVREADZ_ZA8_X2  : ZAReadz<"za8",  "2", "cUc",   "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVREADZ_ZA16_X2 : ZAReadz<"za16", "2", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVREADZ_ZA32_X2 : ZAReadz<"za32", "2", "iUif",  "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVREADZ_ZA64_X2 : ZAReadz<"za64", "2", "lUld",  "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>;
+
+defm SVREADZ_ZA8_X4  : ZAReadz<"za8",  "4", "cUc",   "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVREADZ_ZA16_X4 : ZAReadz<"za16", "4", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVREADZ_ZA32_X4 : ZAReadz<"za32", "4", "iUif",  "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVREADZ_ZA64_X4 : ZAReadz<"za64", "4", "lUld",  "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>;
+
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
new file mode 100644
index 00000000000000..2b80c76e3d9999
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
@@ -0,0 +1,1414 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+ //RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+//
+// X2- hor
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_hor_za8_s8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z26test_svreadz_hor_za8_s8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svreadz_hor_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za8_s8_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_hor_za8_u8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z26test_svreadz_hor_za8_u8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svreadz_hor_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za8_u8_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_hor_za16_s16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z28test_svreadz_hor_za16_s16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+svint16x2_t test_svreadz_hor_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_s16_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_hor_za16_u16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z28test_svreadz_hor_za16_u16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+svuint16x2_t test_svreadz_hor_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_u16_vg2(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x half> @test_svreadz_hor_za16_f16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x half> @_Z28test_svreadz_hor_za16_f16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
+//
+svfloat16x2_t test_svreadz_hor_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_f16_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @test_svreadz_hor_za16_bf16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @_Z29test_svreadz_hor_za16_bf16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
+//
+svbfloat16x2_t test_svreadz_hor_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_bf16_vg2(1, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_hor_za32_s32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z28test_svreadz_hor_za32_s32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+svint32x2_t test_svreadz_hor_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_s32_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_hor_za32_u32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z28test_svreadz_hor_za32_u32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+svuint32x2_t test_svreadz_hor_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_u32_vg2(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x float> @test_svreadz_hor_za32_f32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x float> @_Z28test_svreadz_hor_za32_f32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_svreadz_hor_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_f32_vg2(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_hor_za64_s64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z28test_svreadz_hor_za64_s64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svint64x2_t test_svreadz_hor_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_s64_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_hor_za64_u64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z28test_svreadz_hor_za64_u64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svuint64x2_t test_svreadz_hor_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_u64_vg2(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x double> @test_svreadz_hor_za64_f64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x double> @_Z28test_svreadz_hor_za64_f64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
+//
+svfloat64x2_t test_svreadz_hor_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_f64_vg2(7, slice);
+}
+
+
+//
+// X2- ver
+//
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_ver_za8_s8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z26test_svreadz_ver_za8_s8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svreadz_ver_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za8_s8_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_ver_za8_u8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z26test_svreadz_ver_za8_u8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svreadz_ver_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za8_u8_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_ver_za16_s16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z28test_svreadz_ver_za16_s16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+svint16x2_t test_svreadz_ver_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_s16_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_ver_za16_u16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z28test_svreadz_ver_za16_u16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP4]]
+//
+svuint16x2_t test_svreadz_ver_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_u16_vg2(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x half> @test_svreadz_ver_za16_f16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x half> @_Z28test_svreadz_ver_za16_f16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP4]]
+//
+svfloat16x2_t test_svreadz_ver_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_f16_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @test_svreadz_ver_za16_bf16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @_Z29test_svreadz_ver_za16_bf16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP4]]
+//
+svbfloat16x2_t test_svreadz_ver_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_bf16_vg2(1, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_ver_za32_s32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z28test_svreadz_ver_za32_s32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+svint32x2_t test_svreadz_ver_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_s32_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_ver_za32_u32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z28test_svreadz_ver_za32_u32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
+//
+svuint32x2_t test_svreadz_ver_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_u32_vg2(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x float> @test_svreadz_ver_za32_f32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x float> @_Z28test_svreadz_ver_za32_f32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_svreadz_ver_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_f32_vg2(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_ver_za64_s64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z28test_svreadz_ver_za64_s64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svint64x2_t test_svreadz_ver_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_s64_vg2(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_ver_za64_u64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z28test_svreadz_ver_za64_u64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP4]]
+//
+svuint64x2_t test_svreadz_ver_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_u64_vg2(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x double> @test_svreadz_ver_za64_f64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x double> @_Z28test_svreadz_ver_za64_f64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP4]]
+//
+svfloat64x2_t test_svreadz_ver_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_f64_vg2(7, slice);
+}
+
+
+//
+// X4 - hor
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_hor_za8_s8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z26test_svreadz_hor_za8_s8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svint8x4_t test_svreadz_hor_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za8_s8_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_hor_za8_u8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z26test_svreadz_hor_za8_u8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svuint8x4_t test_svreadz_hor_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za8_u8_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_hor_za16_s16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z28test_svreadz_hor_za16_s16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+svint16x4_t test_svreadz_hor_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_s16_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_hor_za16_u16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z28test_svreadz_hor_za16_u16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+svuint16x4_t test_svreadz_hor_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_u16_vg4(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x half> @test_svreadz_hor_za16_f16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x half> @_Z28test_svreadz_hor_za16_f16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
+//
+svfloat16x4_t test_svreadz_hor_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_f16_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @test_svreadz_hor_za16_bf16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @_Z29test_svreadz_hor_za16_bf16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
+//
+svbfloat16x4_t test_svreadz_hor_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za16_bf16_vg4(1, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_hor_za32_s32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z28test_svreadz_hor_za32_s32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+svint32x4_t test_svreadz_hor_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_s32_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_hor_za32_u32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z28test_svreadz_hor_za32_u32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+svuint32x4_t test_svreadz_hor_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_u32_vg4(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x float> @test_svreadz_hor_za32_f32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x float> @_Z28test_svreadz_hor_za32_f32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
+//
+svfloat32x4_t test_svreadz_hor_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za32_f32_vg4(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_hor_za64_s64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z28test_svreadz_hor_za64_s64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+svint64x4_t test_svreadz_hor_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_s64_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_hor_za64_u64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z28test_svreadz_hor_za64_u64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+svuint64x4_t test_svreadz_hor_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_u64_vg4(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x double> @test_svreadz_hor_za64_f64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x double> @_Z28test_svreadz_hor_za64_f64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
+//
+svfloat64x4_t test_svreadz_hor_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_hor_za64_f64_vg4(7, slice);
+}
+
+//
+// X4 - ver
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_ver_za8_s8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z26test_svreadz_ver_za8_s8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svint8x4_t test_svreadz_ver_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za8_s8_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_ver_za8_u8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z26test_svreadz_ver_za8_u8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svuint8x4_t test_svreadz_ver_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za8_u8_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_ver_za16_s16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z28test_svreadz_ver_za16_s16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+svint16x4_t test_svreadz_ver_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_s16_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_ver_za16_u16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z28test_svreadz_ver_za16_u16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP8]]
+//
+svuint16x4_t test_svreadz_ver_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_u16_vg4(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x half> @test_svreadz_ver_za16_f16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x half> @_Z28test_svreadz_ver_za16_f16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP8]]
+//
+svfloat16x4_t test_svreadz_ver_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_f16_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @test_svreadz_ver_za16_bf16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @_Z29test_svreadz_ver_za16_bf16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP8]]
+//
+svbfloat16x4_t test_svreadz_ver_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za16_bf16_vg4(1, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_ver_za32_s32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z28test_svreadz_ver_za32_s32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+svint32x4_t test_svreadz_ver_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_s32_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_ver_za32_u32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z28test_svreadz_ver_za32_u32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP8]]
+//
+svuint32x4_t test_svreadz_ver_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_u32_vg4(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x float> @test_svreadz_ver_za32_f32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x float> @_Z28test_svreadz_ver_za32_f32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP8]]
+//
+svfloat32x4_t test_svreadz_ver_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za32_f32_vg4(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_ver_za64_s64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z28test_svreadz_ver_za64_s64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+svint64x4_t test_svreadz_ver_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_s64_vg4(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_ver_za64_u64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z28test_svreadz_ver_za64_u64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP8]]
+//
+svuint64x4_t test_svreadz_ver_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_u64_vg4(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x double> @test_svreadz_ver_za64_f64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x double> @_Z28test_svreadz_ver_za64_f64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP8]]
+//
+svfloat64x4_t test_svreadz_ver_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   return svreadz_ver_za64_f64_vg4(7, slice);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bcaa37de74b630..572fd5ecb8dae4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2839,6 +2839,24 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic;
   def int_aarch64_sme_writeq_vert  : SME_VectorToTile_Intrinsic;
 
+  class SME_MOVAZ_TileToVector_X2_Intrinsic
+      : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+          [llvm_i32_ty, llvm_i32_ty],
+          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+
+  class SME_MOVAZ_TileToVector_X4_Intrinsic
+      : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+           LLVMMatchType<0>,LLVMMatchType<0>],
+          [llvm_i32_ty, llvm_i32_ty],
+          [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+
+  def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
+  def int_aarch64_sme_readz_vert_x2  : SME_MOVAZ_TileToVector_X2_Intrinsic;
+
+  def int_aarch64_sme_readz_horiz_x4 : SME_MOVAZ_TileToVector_X4_Intrinsic;
+  def int_aarch64_sme_readz_vert_x4  : SME_MOVAZ_TileToVector_X4_Intrinsic;
+
+
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
   class SME_OuterProduct_Intrinsic
@@ -3646,4 +3664,4 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
 
 def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
    
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
\ No newline at end of file
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 51bec3604026b0..07e6ac4e1b828e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -392,7 +392,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   template <unsigned MaxIdx, unsigned Scale>
   void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
                              unsigned Op);
-
+  template <unsigned MaxIdx, unsigned Scale>
+  void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned Op);
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
   template <int64_t Min, int64_t Max>
@@ -1985,6 +1986,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
   CurDAG->RemoveDeadNode(N);
 }
 
+template <unsigned MaxIdx, unsigned Scale>
+void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs,
+                                                 unsigned Op) {
+
+  SDValue SliceBase = N->getOperand(3);
+  SDValue Base, Offset;
+  if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale))
+    return;
+  // The correct Za tile number is computed in Machine Instruction
+  // See EmitTileMovaz
+  // DAG cannot select Za tile as an output register with ZReg
+  SDLoc DL(N);
+  SDValue Ops[] = {/*TileNum*/ N->getOperand(2), Base, Offset,
+                   /*Chain*/ N->getOperand(0)};
+  SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops);
+
+  EVT VT = N->getValueType(0);
+  for (unsigned I = 0; I < NumVecs; ++I)
+    ReplaceUses(SDValue(N, I),
+                CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT,
+                                               SDValue(Mov, 0)));
+
+  // Copy chain
+  unsigned ChainIdx = NumVecs;
+  ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N,
                                                     unsigned NumOutVecs,
                                                     bool IsTupleInput,
@@ -5175,6 +5204,74 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                                   AArch64::MOVA_VG4_4ZMXI);
       return;
     }
+    case Intrinsic::aarch64_sme_readz_horiz_x2: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMoveZ<14, 2>(Node, 2, AArch64::MOVAZ_2ZMI_H_B_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMoveZ<6, 2>(Node, 2, AArch64::MOVAZ_2ZMI_H_H_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMoveZ<2, 2>(Node, 2, AArch64::MOVAZ_2ZMI_H_S_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMoveZ<0, 2>(Node, 2, AArch64::MOVAZ_2ZMI_H_D_PSEUDO);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_readz_vert_x2: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMoveZ<14, 2>(Node, 2, AArch64::MOVAZ_2ZMI_V_B_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMoveZ<6, 2>(Node, 2, AArch64::MOVAZ_2ZMI_V_H_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMoveZ<2, 2>(Node, 2, AArch64::MOVAZ_2ZMI_V_S_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMoveZ<0, 2>(Node, 2, AArch64::MOVAZ_2ZMI_V_D_PSEUDO);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_readz_horiz_x4: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMoveZ<12, 4>(Node, 4, AArch64::MOVAZ_4ZMI_H_B_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMoveZ<4, 4>(Node, 4, AArch64::MOVAZ_4ZMI_H_H_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMoveZ<0, 4>(Node, 4, AArch64::MOVAZ_4ZMI_H_S_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMoveZ<0, 4>(Node, 4, AArch64::MOVAZ_4ZMI_H_D_PSEUDO);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_readz_vert_x4: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMoveZ<12, 4>(Node, 4, AArch64::MOVAZ_4ZMI_V_B_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMoveZ<4, 4>(Node, 4, AArch64::MOVAZ_4ZMI_V_H_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMoveZ<0, 4>(Node, 4, AArch64::MOVAZ_4ZMI_V_S_PSEUDO);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMoveZ<0, 4>(Node, 4, AArch64::MOVAZ_4ZMI_V_D_PSEUDO);
+        return;
+      }
+      break;
+    }
     case Intrinsic::swift_async_context_addr: {
       SDLoc DL(Node);
       SDValue Chain = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80181a77c9d238..1b7a5f180c484c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2832,6 +2832,23 @@ AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+                                     MachineInstr &MI,
+                                     MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+  MIB.add(MI.getOperand(0)); // ZReg
+  MIB.addReg(BaseReg + MI.getOperand(1).getImm(),
+             RegState::Define);                    // add as output
+  MIB.addReg(BaseReg + MI.getOperand(1).getImm()); // add as input
+  MIB.add(MI.getOperand(2));                       // slice index register
+  MIB.add(MI.getOperand(3));                       // slice index offset
+  MI.eraseFromParent();                            // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -2992,6 +3009,38 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitZero(MI, BB);
   case AArch64::ZERO_T_PSEUDO:
     return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
+  case AArch64::MOVAZ_2ZMI_H_B_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_H_B, AArch64::ZAB0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_H_H_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_H_H, AArch64::ZAH0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_H_S_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_H_S, AArch64::ZAS0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_H_D_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_H_D, AArch64::ZAD0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_V_B_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_V_B, AArch64::ZAB0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_V_H_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_V_H, AArch64::ZAH0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_V_S_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_V_S, AArch64::ZAS0, MI, BB);
+  case AArch64::MOVAZ_2ZMI_V_D_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_2ZMI_V_D, AArch64::ZAD0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_H_B_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_H_B, AArch64::ZAB0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_H_H_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_H_H, AArch64::ZAH0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_H_S_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_H_S, AArch64::ZAS0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_H_D_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_H_D, AArch64::ZAD0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_V_B_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_V_B, AArch64::ZAB0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_V_H_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_V_H, AArch64::ZAH0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_V_S_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_V_S, AArch64::ZAS0, MI, BB);
+  case AArch64::MOVAZ_4ZMI_V_D_PSEUDO:
+    return EmitTileMovaz(AArch64::MOVAZ_4ZMI_V_D, AArch64::ZAD0, MI, BB);
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 18439dc7f01020..8c86b47a701cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -635,6 +635,9 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+                                   MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                  MachineInstr &MI, MachineBasicBlock *BB,
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093cc..d59df7e3f23bc8 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,12 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
   let usesCustomInserter = 1;
 }
 
+class sme2_movez_to_tile_multi_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -4033,6 +4039,17 @@ multiclass sme2_mova_tile_to_vec_vg2_multi<string mnemonic>{
 multiclass sme2p1_movaz_tile_to_vec_vg2<string mnemonic>{
  defm _H : sme2_mova_tile_to_vec_vg2_multi_inst<0b0, 0b010, mnemonic>;
  defm _V : sme2_mova_tile_to_vec_vg2_multi_inst<0b1, 0b010, mnemonic>;
+
+
+ def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>;
+
+ def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_0, uimm3s2range, ZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_1, uimm2s2range, ZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_3, uimm1s2range, ZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_7, uimm0s2range, ZZ_d_mul_r, SMEMatrixTileD>;
 }
 
 class sme2_mova_tile_to_vec_vg4_multi_base<bits<2> sz, bit v, bits<6> op,
@@ -4164,6 +4181,16 @@ multiclass sme2_mova_tile_to_vec_vg4_multi<string mnemonic>{
 multiclass sme2p1_movaz_tile_to_vec_vg4<string mnemonic>{
  defm _H : sme2_mova_tile_to_vec_vg4_multi_base<0b0, 0b110, mnemonic>;
  defm _V : sme2_mova_tile_to_vec_vg4_multi_base<0b1, 0b110, mnemonic>;
+
+ def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>;
+
+ def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_0, uimm2s4range, ZZZZ_b_mul_r, SMEMatrixTileB>;
+ def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_1, uimm1s4range, ZZZZ_h_mul_r, SMEMatrixTileH>;
+ def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_3, uimm0s4range, ZZZZ_s_mul_r, SMEMatrixTileS>;
+ def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo<NAME, sme_elm_idx0_7, uimm0s4range, ZZZZ_d_mul_r, SMEMatrixTileD>;
 }
 
 
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
new file mode 100644
index 00000000000000..53207e543dd39b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
@@ -0,0 +1,457 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s
+
+;MOVAZ (tile to vector, Multi)
+
+
+;;
+; X2 - Horiz
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x2(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z8_i8_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    movaz { z0.b, z1.b }, za0h.b[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.b, z1.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 14
+  %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice.max)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_i16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_i32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 2
+  %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 3, i32 %slice.max)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_i64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
+; CHECK-NEXT:    movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 7, i32 %slice)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_bf16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_f16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_f32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 2
+  %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 %slice.max)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_f64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
+; CHECK-NEXT:    movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 %slice)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+;;
+; X2- Vert
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x2(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z8_i8_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    movaz { z0.b, z1.b }, za0v.b[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.b, z1.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 14
+  %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice.max)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_i16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_i32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 2
+  %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 3, i32 %slice.max)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_i64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
+; CHECK-NEXT:    movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 7, i32 %slice)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_bf16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_f16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 6
+  %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_f32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
+; CHECK-NEXT:    movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 2
+  %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 %slice.max)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_f64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
+; CHECK-NEXT:    movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 %slice)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+;;
+; X4 - Horiz
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x4(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z8_i8_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    movaz { z0.b - z3.b }, za0h.b[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.b - z3.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 12
+  %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice.max)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_i16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_i32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 %slice)
+  %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 3, i32 %slice)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_i64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
+; CHECK-NEXT:    movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 7, i32 %slice)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_bf16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_f16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_f32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 0, i32 %slice)
+  %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 %slice)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_f64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
+; CHECK-NEXT:    movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 %slice)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+;;
+; X4 - Vert
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x4(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z8_i8_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    movaz { z0.b - z3.b }, za0v.b[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.b - z3.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 12
+  %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice.max)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_i16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_i32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 %slice)
+  %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 3, i32 %slice)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_i64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
+; CHECK-NEXT:    movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 7, i32 %slice)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_bf16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_f16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 %slice)
+  %slice.max = add i32 %slice, 4
+  %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 1, i32 %slice.max)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_f32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
+; CHECK-NEXT:    movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 0, i32 %slice)
+  %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 %slice)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_f64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
+; CHECK-NEXT:    movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
+; CHECK-NEXT:    ret
+  %res = call  {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 0, i32 %slice)
+  %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 %slice)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.za8.x2.nxv16i8(i32, i32)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32, i32)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32, i32)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32, i32)
+declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32, i32)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32, i32)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32, i32)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32, i32)
+
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.za8.x2.nxv16i8(i32, i32)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32, i32)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32, i32)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32, i32)
+declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32, i32)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32, i32)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32, i32)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32, i32)
+
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.za8.x4.nxv16i8(i32, i32)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32, i32)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32, i32)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32, i32)
+declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32, i32)
+declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32, i32)
+declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32, i32)
+declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32, i32)
+
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.za8.x4.nxv16i8(i32, i32)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32, i32)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32, i32)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32, i32)
+declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32, i32)
+declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32, i32)
+declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32, i32)
+declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s
new file mode 100644
index 00000000000000..e69de29bb2d1d6

>From bf03a05f3c1ba402d3991f68307106eb3494cf60 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 15 Apr 2024 10:42:36 +0000
Subject: [PATCH 2/3] Remove unwanted test file

---
 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s

diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.s
deleted file mode 100644
index e69de29bb2d1d6..00000000000000

>From 140312b12d8ae81fcbfc8c09c91b20e2f0a3be58 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 15 Apr 2024 13:31:00 +0000
Subject: [PATCH 3/3] [LLVM][AARCH64]Replace +sme2p1+smef16f16 by +smef16f16

According to the latest ISA Spec release[1] all instructions under:
 HasSME2p1 and HasSMEF16F16
should now only require:
HasSMEF16F16

[1]https://developer.arm.com
---
 .../llvm/TargetParser/AArch64TargetParser.h   |   2 +-
 llvm/lib/Target/AArch64/AArch64.td            |   2 +-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   2 +-
 .../test/MC/AArch64/SME2p1/fadd-diagnostics.s |   2 +-
 llvm/test/MC/AArch64/SME2p1/fadd.s            | 110 +++----
 llvm/test/MC/AArch64/SME2p1/fcvt.s            |  20 +-
 .../MC/AArch64/SME2p1/fcvtl-diagnostics.s     |   2 +-
 llvm/test/MC/AArch64/SME2p1/fcvtl.s           |  20 +-
 .../test/MC/AArch64/SME2p1/fmla-diagnostics.s |   4 +-
 llvm/test/MC/AArch64/SME2p1/fmla.s            | 300 +++++++++---------
 .../test/MC/AArch64/SME2p1/fmls-diagnostics.s |   4 +-
 llvm/test/MC/AArch64/SME2p1/fmls.s            | 300 +++++++++---------
 .../MC/AArch64/SME2p1/fmopa-diagnostics.s     |   2 +-
 llvm/test/MC/AArch64/SME2p1/fmopa.s           |  36 +--
 .../MC/AArch64/SME2p1/fmops-diagnostics.s     |   2 +-
 llvm/test/MC/AArch64/SME2p1/fmops.s           |  36 +--
 .../test/MC/AArch64/SME2p1/fsub-diagnostics.s |   3 +-
 llvm/test/MC/AArch64/SME2p1/fsub.s            | 108 +++----
 18 files changed, 479 insertions(+), 476 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13c7..276cb66e80acbb 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -302,7 +302,7 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"ssve-fp8dot4", AArch64::AEK_SSVE_FP8DOT4, "+ssve-fp8dot4", "-ssve-fp8dot4", FEAT_INIT, "+sme2", 0},
     {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
     {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
-    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+sme2,+fp8", 0},
+    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+sme2,+fp8,+sme-f16f16", 0},
     {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
     {"sme-fa64",  AArch64::AEK_SMEFA64,  "+sme-fa64", "-sme-fa64",  FEAT_INIT, "", 0},
     {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 741c97a3dc009b..e9bc79578fde5f 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -554,7 +554,7 @@ def FeatureSME_LUTv2 : SubtargetFeature<"sme-lutv2", "HasSME_LUTv2", "true",
   "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
 
 def FeatureSMEF8F16 : SubtargetFeature<"sme-f8f16", "HasSMEF8F16", "true",
-  "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
+  "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8, FeatureSMEF16F16]>;
 
 def FeatureSMEF8F32 : SubtargetFeature<"sme-f8f32", "HasSMEF8F32", "true",
   "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa25343450..37451df5bfcb08 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -792,7 +792,7 @@ defm LUTI4_S_2ZTZI : sme2p1_luti4_vector_vg2_index<"luti4">;
 defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">;
 }
 
-let Predicates = [HasSME2p1, HasSMEF16F16] in {
+let Predicates = [HasSMEF16F16] in {
 defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
 defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
index c13a1be05b1cd0..a2fcb6622002b5 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Out of range index offset
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd.s b/llvm/test/MC/AArch64/SME2p1/fadd.s
index a8e64a63dbdb60..a7a4db4e979426 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd.s
@@ -1,300 +1,302 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fadd    za.h[w8, 0, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c00 <unknown>
 
 fadd    za.h[w8, 0], {z0.h - z1.h}  // 11000001-10100100-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c00 <unknown>
 
 fadd    za.h[w10, 5, vgx2], {z10.h, z11.h}  // 11000001-10100100-01011101-01000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45d45 <unknown>
 
 fadd    za.h[w10, 5], {z10.h - z11.h}  // 11000001-10100100-01011101-01000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45d45 <unknown>
 
 fadd    za.h[w11, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d87 <unknown>
 
 fadd    za.h[w11, 7], {z12.h - z13.h}  // 11000001-10100100-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d87 <unknown>
 
 fadd    za.h[w11, 7, vgx2], {z30.h, z31.h}  // 11000001-10100100-01111111-11000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47fc7 <unknown>
 
 fadd    za.h[w11, 7], {z30.h - z31.h}  // 11000001-10100100-01111111-11000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47fc7 <unknown>
 
 fadd    za.h[w8, 5, vgx2], {z16.h, z17.h}  // 11000001-10100100-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41e05 <unknown>
 
 fadd    za.h[w8, 5], {z16.h - z17.h}  // 11000001-10100100-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41e05 <unknown>
 
 fadd    za.h[w8, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c01 <unknown>
 
 fadd    za.h[w8, 1], {z0.h - z1.h}  // 11000001-10100100-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c01 <unknown>
 
 fadd    za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110, 01000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45e40 <unknown>
 
 fadd    za.h[w10, 0], {z18.h - z19.h}  // 11000001-10100100-01011110-01000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45e40 <unknown>
 
 fadd    za.h[w8, 0, vgx2], {z12.h, z13.h}  // 11000001-10100100-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41d80 <unknown>
 
 fadd    za.h[w8, 0], {z12.h - z13.h}  // 11000001-10100100-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41d80 <unknown>
 
 fadd    za.h[w10, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45c01 <unknown>
 
 fadd    za.h[w10, 1], {z0.h - z1.h}  // 11000001-10100100-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45c01 <unknown>
 
 fadd    za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110, 11000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41ec5 <unknown>
 
 fadd    za.h[w8, 5], {z22.h - z23.h}  // 11000001-10100100-00011110-11000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41ec5 <unknown>
 
 fadd    za.h[w11, 2, vgx2], {z8.h, z9.h}  // 11000001-10100100-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d02 <unknown>
 
 fadd    za.h[w11, 2], {z8.h - z9.h}  // 11000001-10100100-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d02 <unknown>
 
 fadd    za.h[w9, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a43d87 <unknown>
 
 fadd    za.h[w9, 7], {z12.h - z13.h}  // 11000001-10100100-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a43d87 <unknown>
 
 fadd    za.h[w8, 0, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c00 <unknown>
 
 fadd    za.h[w8, 0], {z0.h - z3.h}  // 11000001-10100101-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c00 <unknown>
 
 fadd    za.h[w10, 5, vgx4], {z8.h - z11.h}  // 11000001-10100101-01011101-00000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55d05 <unknown>
 
 fadd    za.h[w10, 5], {z8.h - z11.h}  // 11000001-10100101-01011101-00000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55d05 <unknown>
 
 fadd    za.h[w11, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d87 <unknown>
 
 fadd    za.h[w11, 7], {z12.h - z15.h}  // 11000001-10100101-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d87 <unknown>
 
 fadd    za.h[w11, 7, vgx4], {z28.h - z31.h}  // 11000001-10100101-01111111-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57f87 <unknown>
 
 fadd    za.h[w11, 7], {z28.h - z31.h}  // 11000001-10100101-01111111-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57f87 <unknown>
 
 fadd    za.h[w8, 5, vgx4], {z16.h - z19.h}  // 11000001-10100101-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e05 <unknown>
 
 fadd    za.h[w8, 5], {z16.h - z19.h}  // 11000001-10100101-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e05 <unknown>
 
 fadd    za.h[w8, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c01 <unknown>
 
 fadd    za.h[w8, 1], {z0.h - z3.h}  // 11000001-10100101-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c01 <unknown>
 
 fadd    za.h[w10, 0, vgx4], {z16.h - z19.h}  // 11000001-10100101-01011110-00000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55e00 <unknown>
 
 fadd    za.h[w10, 0], {z16.h - z19.h}  // 11000001-10100101-01011110-00000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55e00 <unknown>
 
 fadd    za.h[w8, 0, vgx4], {z12.h - z15.h}  // 11000001-10100101-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51d80 <unknown>
 
 fadd    za.h[w8, 0], {z12.h - z15.h}  // 11000001-10100101-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51d80 <unknown>
 
 fadd    za.h[w10, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55c01 <unknown>
 
 fadd    za.h[w10, 1], {z0.h - z3.h}  // 11000001-10100101-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55c01 <unknown>
 
 fadd    za.h[w8, 5, vgx4], {z20.h - z23.h}  // 11000001-10100101-00011110-10000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e85 <unknown>
 
 fadd    za.h[w8, 5], {z20.h - z23.h}  // 11000001-10100101-00011110-10000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e85 <unknown>
 
 fadd    za.h[w11, 2, vgx4], {z8.h - z11.h}  // 11000001-10100101-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d02 <unknown>
 
 fadd    za.h[w11, 2], {z8.h - z11.h}  // 11000001-10100101-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d02 <unknown>
 
 fadd    za.h[w9, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a53d87 <unknown>
 
 fadd    za.h[w9, 7], {z12.h - z15.h}  // 11000001-10100101-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a53d87 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvt.s b/llvm/test/MC/AArch64/SME2p1/fcvt.s
index b5707bad0a24e3..2731055dedec41 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvt.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvt.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fcvt    {z0.s, z1.s}, z0.h  // 11000001-10100000-11100000-00000000
 // CHECK-INST: fcvt    { z0.s, z1.s }, z0.h
 // CHECK-ENCODING: [0x00,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e000 <unknown>
 
 fcvt    {z20.s, z21.s}, z10.h  // 11000001-10100000-11100001-01010100
 // CHECK-INST: fcvt    { z20.s, z21.s }, z10.h
 // CHECK-ENCODING: [0x54,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e154 <unknown>
 
 fcvt    {z22.s, z23.s}, z13.h  // 11000001-10100000-11100001-10110110
 // CHECK-INST: fcvt    { z22.s, z23.s }, z13.h
 // CHECK-ENCODING: [0xb6,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e1b6 <unknown>
 
 fcvt    {z30.s, z31.s}, z31.h  // 11000001-10100000-11100011-11111110
 // CHECK-INST: fcvt    { z30.s, z31.s }, z31.h
 // CHECK-ENCODING: [0xfe,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e3fe <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
index a723d2fc6f3ac9..ad3eaba7bdc211 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl.s b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
index 31cf90d037969c..6284915e49831b 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fcvtl   {z0.s, z1.s}, z0.h  // 11000001-10100000-11100000-00000001
 // CHECK-INST: fcvtl   { z0.s, z1.s }, z0.h
 // CHECK-ENCODING: [0x01,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e001 <unknown>
 
 fcvtl   {z20.s, z21.s}, z10.h  // 11000001-10100000-11100001-01010101
 // CHECK-INST: fcvtl   { z20.s, z21.s }, z10.h
 // CHECK-ENCODING: [0x55,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e155 <unknown>
 
 fcvtl   {z22.s, z23.s}, z13.h  // 11000001-10100000-11100001-10110111
 // CHECK-INST: fcvtl   { z22.s, z23.s }, z13.h
 // CHECK-ENCODING: [0xb7,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e1b7 <unknown>
 
 fcvtl   {z30.s, z31.s}, z31.h  // 11000001-10100000-11100011-11111111
 // CHECK-INST: fcvtl   { z30.s, z31.s }, z31.h
 // CHECK-ENCODING: [0xff,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e3ff <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
index d32f795728a259..c31b54fc05deaf 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
@@ -66,7 +66,7 @@ fmla za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
 // CHECK-NEXT: fmla za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla.s b/llvm/test/MC/AArch64/SME2p1/fmla.s
index 10529d81eed634..df9ac8076e5664 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla.s
@@ -1,877 +1,877 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h  // 11000001-00100000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c00 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, z0.h  // 11000001-00100000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c00 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h  // 11000001-00100101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d45 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, z5.h  // 11000001-00100101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d45 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h  // 11000001-00101000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287da7 <unknown>
 
 fmla    za.h[w11, 7], {z13.h - z14.h}, z8.h  // 11000001-00101000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287da7 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h  // 11000001-00101111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fe7 <unknown>
 
 fmla    za.h[w11, 7], {z31.h - z0.h}, z15.h  // 11000001-00101111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fe7 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h  // 11000001-00100000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e25 <unknown>
 
 fmla    za.h[w8, 5], {z17.h - z18.h}, z0.h  // 11000001-00100000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e25 <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h  // 11000001-00101110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c21 <unknown>
 
 fmla    za.h[w8, 1], {z1.h - z2.h}, z14.h  // 11000001-00101110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c21 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h  // 11000001-00100100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e60 <unknown>
 
 fmla    za.h[w10, 0], {z19.h - z20.h}, z4.h  // 11000001-00100100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e60 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h  // 11000001-00100010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d80 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, z2.h  // 11000001-00100010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d80 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h  // 11000001-00101010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c21 <unknown>
 
 fmla    za.h[w10, 1], {z1.h - z2.h}, z10.h  // 11000001-00101010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c21 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h  // 11000001-00101110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ec5 <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, z14.h  // 11000001-00101110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ec5 <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h  // 11000001-00100001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d22 <unknown>
 
 fmla    za.h[w11, 2], {z9.h - z10.h}, z1.h  // 11000001-00100001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d22 <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h  // 11000001-00101011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d87 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, z11.h  // 11000001-00101011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d87 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0]  // 11000001-00010000-00010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101000 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, z0.h[0]  // 11000001-00010000-00010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101000 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2]  // 11000001-00010101-01010101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155545 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, z5.h[2]  // 11000001-00010101-01010101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155545 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6]  // 11000001-00011000-01111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d87 <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z13.h}, z8.h[6]  // 11000001-00011000-01111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d87 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7]  // 11000001-00011111-01111111-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fcf <unknown>
 
 fmla    za.h[w11, 7], {z30.h - z31.h}, z15.h[7]  // 11000001-00011111-01111111-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fcf <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6]  // 11000001-00010000-00011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e05 <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z17.h}, z0.h[6]  // 11000001-00010000-00011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e05 <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2]  // 11000001-00011110-00010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1401 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z1.h}, z14.h[2]  // 11000001-00011110-00010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1401 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3]  // 11000001-00010100-01010110-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145648 <unknown>
 
 fmla    za.h[w10, 0], {z18.h - z19.h}, z4.h[3]  // 11000001-00010100-01010110-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145648 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4]  // 11000001-00010010-00011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121980 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, z2.h[4]  // 11000001-00010010-00011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121980 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4]  // 11000001-00011010-01011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5801 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z1.h}, z10.h[4]  // 11000001-00011010-01011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5801 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5]  // 11000001-00011110-00011010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1acd <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, z14.h[5]  // 11000001-00011110-00011010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1acd <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2]  // 11000001-00010001-01110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117502 <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z9.h}, z1.h[2]  // 11000001-00010001-01110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117502 <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4]  // 11000001-00011011-00111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3987 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, z11.h[4]  // 11000001-00011011-00111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3987 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h}  // 11000001-10100000-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01008 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h}  // 11000001-10100000-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01008 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h}  // 11000001-10110100-01010001-01001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4514d <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h}  // 11000001-10110100-01010001-01001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4514d <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h}  // 11000001-10101000-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8718f <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h}  // 11000001-10101000-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8718f <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h}  // 11000001-10111110-01110011-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73cf <unknown>
 
 fmla    za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h}  // 11000001-10111110-01110011-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73cf <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h}  // 11000001-10110000-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0120d <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h}  // 11000001-10110000-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0120d <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h}  // 11000001-10111110-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1009 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h}  // 11000001-10111110-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1009 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h}  // 11000001-10110100-01010010-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45248 <unknown>
 
 fmla    za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h}  // 11000001-10110100-01010010-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45248 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h}  // 11000001-10100010-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21188 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h}  // 11000001-10100010-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21188 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h}  // 11000001-10111010-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5009 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h}  // 11000001-10111010-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5009 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h}  // 11000001-10111110-00010010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12cd <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h}  // 11000001-10111110-00010010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12cd <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h}  // 11000001-10100000-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0710a <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h}  // 11000001-10100000-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0710a <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h}  // 11000001-10101010-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa318f <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h}  // 11000001-10101010-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa318f <unknown>
 
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c00 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c00 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d45 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d45 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387da7 <unknown>
 
 fmla    za.h[w11, 7], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387da7 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fe7 <unknown>
 
 fmla    za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fe7 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e25 <unknown>
 
 fmla    za.h[w8, 5], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e25 <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c21 <unknown>
 
 fmla    za.h[w8, 1], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c21 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e60 <unknown>
 
 fmla    za.h[w10, 0], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e60 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d80 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d80 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c21 <unknown>
 
 fmla    za.h[w10, 1], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c21 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ec5 <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ec5 <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d22 <unknown>
 
 fmla    za.h[w11, 2], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d22 <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d87 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d87 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109000 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109000 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d505 <unknown>
 
 fmla    za.h[w10, 5], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d505 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd87 <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd87 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff8f <unknown>
 
 fmla    za.h[w11, 7], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff8f <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e05 <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e05 <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9401 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9401 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d608 <unknown>
 
 fmla    za.h[w10, 0], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d608 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129980 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129980 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad801 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad801 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a8d <unknown>
 
 fmla    za.h[w8, 5], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a8d <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f502 <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f502 <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb987 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb987 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11008 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11008 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5510d <unknown>
 
 fmla    za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5510d <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9718f <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9718f <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd738f <unknown>
 
 fmla    za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd738f <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1120d <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1120d <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1009 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1009 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55208 <unknown>
 
 fmla    za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55208 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11188 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11188 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95009 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95009 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd128d <unknown>
 
 fmla    za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd128d <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1710a <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1710a <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9318f <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9318f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
index 2174e4202ba0d0..2deb18186eafca 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
@@ -66,7 +66,7 @@ fmls za.h[w8, 8, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // Invalid Register Suffix
 
 fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .h
 // CHECK-NEXT: fmls za.d[w8, 7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls.s b/llvm/test/MC/AArch64/SME2p1/fmls.s
index 9bbb21869e3779..67b1430240e8d9 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls.s
@@ -1,878 +1,878 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d  --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h  // 11000001-00100000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c08 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, z0.h  // 11000001-00100000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c08 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h  // 11000001-00100101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d4d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, z5.h  // 11000001-00100101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d4d <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h  // 11000001-00101000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287daf <unknown>
 
 fmls    za.h[w11, 7], {z13.h - z14.h}, z8.h  // 11000001-00101000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287daf <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h  // 11000001-00101111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fef <unknown>
 
 fmls    za.h[w11, 7], {z31.h - z0.h}, z15.h  // 11000001-00101111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fef <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h  // 11000001-00100000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e2d <unknown>
 
 fmls    za.h[w8, 5], {z17.h - z18.h}, z0.h  // 11000001-00100000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e2d <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h  // 11000001-00101110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c29 <unknown>
 
 fmls    za.h[w8, 1], {z1.h - z2.h}, z14.h  // 11000001-00101110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c29 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h  // 11000001-00100100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e68 <unknown>
 
 fmls    za.h[w10, 0], {z19.h - z20.h}, z4.h  // 11000001-00100100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e68 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h  // 11000001-00100010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d88 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, z2.h  // 11000001-00100010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d88 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h  // 11000001-00101010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c29 <unknown>
 
 fmls    za.h[w10, 1], {z1.h - z2.h}, z10.h  // 11000001-00101010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c29 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h  // 11000001-00101110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ecd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, z14.h  // 11000001-00101110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ecd <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h  // 11000001-00100001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d2a <unknown>
 
 fmls    za.h[w11, 2], {z9.h - z10.h}, z1.h  // 11000001-00100001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d2a <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h  // 11000001-00101011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d8f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, z11.h  // 11000001-00101011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d8f <unknown>
 
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0]  // 11000001-00010000-00010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101010 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, z0.h[0]  // 11000001-00010000-00010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101010 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2]  // 11000001-00010101-01010101-01010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155555 <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, z5.h[2]  // 11000001-00010101-01010101-01010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155555 <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6]  // 11000001-00011000-01111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d97 <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z13.h}, z8.h[6]  // 11000001-00011000-01111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d97 <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7]  // 11000001-00011111-01111111-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fdf <unknown>
 
 fmls    za.h[w11, 7], {z30.h - z31.h}, z15.h[7]  // 11000001-00011111-01111111-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fdf <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6]  // 11000001-00010000-00011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e15 <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z17.h}, z0.h[6]  // 11000001-00010000-00011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e15 <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2]  // 11000001-00011110-00010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1411 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z1.h}, z14.h[2]  // 11000001-00011110-00010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1411 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3]  // 11000001-00010100-01010110-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145658 <unknown>
 
 fmls    za.h[w10, 0], {z18.h - z19.h}, z4.h[3]  // 11000001-00010100-01010110-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145658 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4]  // 11000001-00010010-00011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121990 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, z2.h[4]  // 11000001-00010010-00011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121990 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4]  // 11000001-00011010-01011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5811 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z1.h}, z10.h[4]  // 11000001-00011010-01011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5811 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5]  // 11000001-00011110-00011010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1add <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, z14.h[5]  // 11000001-00011110-00011010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1add <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2]  // 11000001-00010001-01110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117512 <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z9.h}, z1.h[2]  // 11000001-00010001-01110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117512 <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4]  // 11000001-00011011-00111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3997 <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, z11.h[4]  // 11000001-00011011-00111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3997 <unknown>
 
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h}  // 11000001-10100000-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01018 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h}  // 11000001-10100000-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01018 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h}  // 11000001-10110100-01010001-01011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4515d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h}  // 11000001-10110100-01010001-01011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4515d <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h}  // 11000001-10101000-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8719f <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h}  // 11000001-10101000-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8719f <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h}  // 11000001-10111110-01110011-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73df <unknown>
 
 fmls    za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h}  // 11000001-10111110-01110011-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73df <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h}  // 11000001-10110000-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0121d <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h}  // 11000001-10110000-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0121d <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h}  // 11000001-10111110-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1019 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h}  // 11000001-10111110-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1019 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h}  // 11000001-10110100-01010010-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45258 <unknown>
 
 fmls    za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h}  // 11000001-10110100-01010010-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45258 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h}  // 11000001-10100010-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21198 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h}  // 11000001-10100010-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21198 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h}  // 11000001-10111010-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5019 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h}  // 11000001-10111010-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5019 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h}  // 11000001-10111110-00010010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12dd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h}  // 11000001-10111110-00010010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12dd <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h}  // 11000001-10100000-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0711a <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h}  // 11000001-10100000-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0711a <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h}  // 11000001-10101010-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa319f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h}  // 11000001-10101010-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa319f <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c08 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c08 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d4d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d4d <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387daf <unknown>
 
 fmls    za.h[w11, 7], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387daf <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fef <unknown>
 
 fmls    za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fef <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e2d <unknown>
 
 fmls    za.h[w8, 5], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e2d <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c29 <unknown>
 
 fmls    za.h[w8, 1], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c29 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e68 <unknown>
 
 fmls    za.h[w10, 0], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e68 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d88 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d88 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c29 <unknown>
 
 fmls    za.h[w10, 1], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c29 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ecd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ecd <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d2a <unknown>
 
 fmls    za.h[w11, 2], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d2a <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d8f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d8f <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109010 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109010 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d515 <unknown>
 
 fmls    za.h[w10, 5], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d515 <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd97 <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd97 <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff9f <unknown>
 
 fmls    za.h[w11, 7], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff9f <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e15 <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e15 <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9411 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9411 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d618 <unknown>
 
 fmls    za.h[w10, 0], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d618 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129990 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129990 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad811 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad811 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a9d <unknown>
 
 fmls    za.h[w8, 5], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a9d <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f512 <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f512 <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb997 <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb997 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11018 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11018 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5511d <unknown>
 
 fmls    za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5511d <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9719f <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9719f <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd739f <unknown>
 
 fmls    za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd739f <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1121d <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1121d <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1019 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1019 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55218 <unknown>
 
 fmls    za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55218 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11198 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11198 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95019 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95019 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd129d <unknown>
 
 fmls    za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd129d <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1711a <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1711a <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9319f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9319f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
index def19a316c2ac8..1c561959c25e0f 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa.s b/llvm/test/MC/AArch64/SME2p1/fmopa.s
index e53d21244fde39..0a586d3acc4222 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa.s
@@ -1,85 +1,85 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d  --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmopa   za0.h, p0/m, p0/m, z0.h, z0.h  // 10000001-10000000-00000000-00001000
 // CHECK-INST: fmopa   za0.h, p0/m, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x08,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81800008 <unknown>
 
 fmopa   za1.h, p5/m, p2/m, z10.h, z21.h  // 10000001-10010101-01010101-01001001
 // CHECK-INST: fmopa   za1.h, p5/m, p2/m, z10.h, z21.h
 // CHECK-ENCODING: [0x49,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81955549 <unknown>
 
 fmopa   za1.h, p3/m, p7/m, z13.h, z8.h  // 10000001-10001000-11101101-10101001
 // CHECK-INST: fmopa   za1.h, p3/m, p7/m, z13.h, z8.h
 // CHECK-ENCODING: [0xa9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8188eda9 <unknown>
 
 fmopa   za1.h, p7/m, p7/m, z31.h, z31.h  // 10000001-10011111-11111111-11101001
 // CHECK-INST: fmopa   za1.h, p7/m, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xe9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819fffe9 <unknown>
 
 fmopa   za1.h, p3/m, p0/m, z17.h, z16.h  // 10000001-10010000-00001110-00101001
 // CHECK-INST: fmopa   za1.h, p3/m, p0/m, z17.h, z16.h
 // CHECK-ENCODING: [0x29,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81900e29 <unknown>
 
 fmopa   za1.h, p1/m, p4/m, z1.h, z30.h  // 10000001-10011110-10000100-00101001
 // CHECK-INST: fmopa   za1.h, p1/m, p4/m, z1.h, z30.h
 // CHECK-ENCODING: [0x29,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e8429 <unknown>
 
 fmopa   za0.h, p5/m, p2/m, z19.h, z20.h  // 10000001-10010100-01010110-01101000
 // CHECK-INST: fmopa   za0.h, p5/m, p2/m, z19.h, z20.h
 // CHECK-ENCODING: [0x68,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81945668 <unknown>
 
 fmopa   za0.h, p6/m, p0/m, z12.h, z2.h  // 10000001-10000010-00011001-10001000
 // CHECK-INST: fmopa   za0.h, p6/m, p0/m, z12.h, z2.h
 // CHECK-ENCODING: [0x88,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81821988 <unknown>
 
 fmopa   za1.h, p2/m, p6/m, z1.h, z26.h  // 10000001-10011010-11001000-00101001
 // CHECK-INST: fmopa   za1.h, p2/m, p6/m, z1.h, z26.h
 // CHECK-ENCODING: [0x29,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ac829 <unknown>
 
 fmopa   za1.h, p2/m, p0/m, z22.h, z30.h  // 10000001-10011110-00001010-11001001
 // CHECK-INST: fmopa   za1.h, p2/m, p0/m, z22.h, z30.h
 // CHECK-ENCODING: [0xc9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e0ac9 <unknown>
 
 fmopa   za0.h, p5/m, p7/m, z9.h, z1.h  // 10000001-10000001-11110101-00101000
 // CHECK-INST: fmopa   za0.h, p5/m, p7/m, z9.h, z1.h
 // CHECK-ENCODING: [0x28,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8181f528 <unknown>
 
 fmopa   za1.h, p2/m, p5/m, z12.h, z11.h  // 10000001-10001011-10101001-10001001
 // CHECK-INST: fmopa   za1.h, p2/m, p5/m, z12.h, z11.h
 // CHECK-ENCODING: [0x89,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 818ba989 <unknown>
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
index 75eea811326244..0ec227ae0e68ec 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops.s b/llvm/test/MC/AArch64/SME2p1/fmops.s
index 325d4c125b6064..597665d5915013 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops.s
@@ -1,84 +1,84 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmops   za0.h, p0/m, p0/m, z0.h, z0.h  // 10000001-10000000-00000000-00011000
 // CHECK-INST: fmops   za0.h, p0/m, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x18,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81800018 <unknown>
 
 fmops   za1.h, p5/m, p2/m, z10.h, z21.h  // 10000001-10010101-01010101-01011001
 // CHECK-INST: fmops   za1.h, p5/m, p2/m, z10.h, z21.h
 // CHECK-ENCODING: [0x59,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81955559 <unknown>
 
 fmops   za1.h, p3/m, p7/m, z13.h, z8.h  // 10000001-10001000-11101101-10111001
 // CHECK-INST: fmops   za1.h, p3/m, p7/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8188edb9 <unknown>
 
 fmops   za1.h, p7/m, p7/m, z31.h, z31.h  // 10000001-10011111-11111111-11111001
 // CHECK-INST: fmops   za1.h, p7/m, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xf9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ffff9 <unknown>
 
 fmops   za1.h, p3/m, p0/m, z17.h, z16.h  // 10000001-10010000-00001110-00111001
 // CHECK-INST: fmops   za1.h, p3/m, p0/m, z17.h, z16.h
 // CHECK-ENCODING: [0x39,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81900e39 <unknown>
 
 fmops   za1.h, p1/m, p4/m, z1.h, z30.h  // 10000001-10011110-10000100-00111001
 // CHECK-INST: fmops   za1.h, p1/m, p4/m, z1.h, z30.h
 // CHECK-ENCODING: [0x39,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e8439 <unknown>
 
 fmops   za0.h, p5/m, p2/m, z19.h, z20.h  // 10000001-10010100-01010110-01111000
 // CHECK-INST: fmops   za0.h, p5/m, p2/m, z19.h, z20.h
 // CHECK-ENCODING: [0x78,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81945678 <unknown>
 
 fmops   za0.h, p6/m, p0/m, z12.h, z2.h  // 10000001-10000010-00011001-10011000
 // CHECK-INST: fmops   za0.h, p6/m, p0/m, z12.h, z2.h
 // CHECK-ENCODING: [0x98,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81821998 <unknown>
 
 fmops   za1.h, p2/m, p6/m, z1.h, z26.h  // 10000001-10011010-11001000-00111001
 // CHECK-INST: fmops   za1.h, p2/m, p6/m, z1.h, z26.h
 // CHECK-ENCODING: [0x39,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ac839 <unknown>
 
 fmops   za1.h, p2/m, p0/m, z22.h, z30.h  // 10000001-10011110-00001010-11011001
 // CHECK-INST: fmops   za1.h, p2/m, p0/m, z22.h, z30.h
 // CHECK-ENCODING: [0xd9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e0ad9 <unknown>
 
 fmops   za0.h, p5/m, p7/m, z9.h, z1.h  // 10000001-10000001-11110101-00111000
 // CHECK-INST: fmops   za0.h, p5/m, p7/m, z9.h, z1.h
 // CHECK-ENCODING: [0x38,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8181f538 <unknown>
 
 fmops   za1.h, p2/m, p5/m, z12.h, z11.h  // 10000001-10001011-10101001-10011001
 // CHECK-INST: fmops   za1.h, p2/m, p5/m, z12.h, z11.h
 // CHECK-ENCODING: [0x99,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 818ba999 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
index 716427a2f725d1..2a59eb02450e76 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
@@ -1,5 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
-
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 2>&1 < %s | FileCheck %s
 // --------------------------------------------------------------------------//
 // Out of range index offset
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub.s b/llvm/test/MC/AArch64/SME2p1/fsub.s
index b3735d554765ac..5574c1e8d9e6da 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub.s
@@ -1,296 +1,298 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 
 fsub    za.h[w8, 0], {z0.h - z1.h}  // 11000001-10100100-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c08 <unknown>
 
 fsub    za.h[w10, 5, vgx2], {z10.h, z11.h}  // 11000001-10100100-01011101-01001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45d4d <unknown>
 
 fsub    za.h[w10, 5], {z10.h - z11.h}  // 11000001-10100100-01011101-01001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45d4d <unknown>
 
 fsub    za.h[w11, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d8f <unknown>
 
 fsub    za.h[w11, 7], {z12.h - z13.h}  // 11000001-10100100-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d8f <unknown>
 
 fsub    za.h[w11, 7, vgx2], {z30.h, z31.h}  // 11000001-10100100-01111111-11001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47fcf <unknown>
 
 fsub    za.h[w11, 7], {z30.h - z31.h}  // 11000001-10100100-01111111-11001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47fcf <unknown>
 
 fsub    za.h[w8, 5, vgx2], {z16.h, z17.h}  // 11000001-10100100-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41e0d <unknown>
 
 fsub    za.h[w8, 5], {z16.h - z17.h}  // 11000001-10100100-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41e0d <unknown>
 
 fsub    za.h[w8, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c09 <unknown>
 
 fsub    za.h[w8, 1], {z0.h - z1.h}  // 11000001-10100100-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41c09 <unknown>
 
 fsub    za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110, 01001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45e48 <unknown>
 
 fsub    za.h[w10, 0], {z18.h - z19.h}  // 11000001-10100100-01011110-01001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45e48 <unknown>
 
 fsub    za.h[w8, 0, vgx2], {z12.h, z13.h}  // 11000001-10100100-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41d88 <unknown>
 
 fsub    za.h[w8, 0], {z12.h - z13.h}  // 11000001-10100100-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41d88 <unknown>
 
 fsub    za.h[w10, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45c09 <unknown>
 
 fsub    za.h[w10, 1], {z0.h - z1.h}  // 11000001-10100100-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a45c09 <unknown>
 
 fsub    za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110, 11001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41ecd <unknown>
 
 fsub    za.h[w8, 5], {z22.h - z23.h}  // 11000001-10100100-00011110-11001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a41ecd <unknown>
 
 fsub    za.h[w11, 2, vgx2], {z8.h, z9.h}  // 11000001-10100100-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d0a <unknown>
 
 fsub    za.h[w11, 2], {z8.h - z9.h}  // 11000001-10100100-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a47d0a <unknown>
 
 fsub    za.h[w9, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a43d8f <unknown>
 
 fsub    za.h[w9, 7], {z12.h - z13.h}  // 11000001-10100100-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a43d8f <unknown>
 
 
 fsub    za.h[w8, 0, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c08 <unknown>
 
 fsub    za.h[w8, 0], {z0.h - z3.h}  // 11000001-10100101-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c08 <unknown>
 
 fsub    za.h[w10, 5, vgx4], {z8.h - z11.h}  // 11000001-10100101-01011101-00001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55d0d <unknown>
 
 fsub    za.h[w10, 5], {z8.h - z11.h}  // 11000001-10100101-01011101-00001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55d0d <unknown>
 
 fsub    za.h[w11, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d8f <unknown>
 
 fsub    za.h[w11, 7], {z12.h - z15.h}  // 11000001-10100101-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d8f <unknown>
 
 fsub    za.h[w11, 7, vgx4], {z28.h - z31.h}  // 11000001-10100101-01111111-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57f8f <unknown>
 
 fsub    za.h[w11, 7], {z28.h - z31.h}  // 11000001-10100101-01111111-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57f8f <unknown>
 
 fsub    za.h[w8, 5, vgx4], {z16.h - z19.h}  // 11000001-10100101-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e0d <unknown>
 
 fsub    za.h[w8, 5], {z16.h - z19.h}  // 11000001-10100101-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e0d <unknown>
 
 fsub    za.h[w8, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c09 <unknown>
 
 fsub    za.h[w8, 1], {z0.h - z3.h}  // 11000001-10100101-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51c09 <unknown>
 
 fsub    za.h[w10, 0, vgx4], {z16.h - z19.h}  // 11000001-10100101-01011110-00001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55e08 <unknown>
 
 fsub    za.h[w10, 0], {z16.h - z19.h}  // 11000001-10100101-01011110-00001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55e08 <unknown>
 
 fsub    za.h[w8, 0, vgx4], {z12.h - z15.h}  // 11000001-10100101-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51d88 <unknown>
 
 fsub    za.h[w8, 0], {z12.h - z15.h}  // 11000001-10100101-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51d88 <unknown>
 
 fsub    za.h[w10, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55c09 <unknown>
 
 fsub    za.h[w10, 1], {z0.h - z3.h}  // 11000001-10100101-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a55c09 <unknown>
 
 fsub    za.h[w8, 5, vgx4], {z20.h - z23.h}  // 11000001-10100101-00011110-10001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e8d <unknown>
 
 fsub    za.h[w8, 5], {z20.h - z23.h}  // 11000001-10100101-00011110-10001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a51e8d <unknown>
 
 fsub    za.h[w11, 2, vgx4], {z8.h - z11.h}  // 11000001-10100101-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d0a <unknown>
 
 fsub    za.h[w11, 2], {z8.h - z11.h}  // 11000001-10100101-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a57d0a <unknown>
 
 fsub    za.h[w9, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a53d8f <unknown>
 
 fsub    za.h[w9, 7], {z12.h - z15.h}  // 11000001-10100101-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a53d8f <unknown>



More information about the cfe-commits mailing list