[clang] [llvm] [CLANG][LLVM][AArch64]Add SME2.1 intrinsics for MOVAZ array to vector (PR #88901)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Apr 16 07:30:47 PDT 2024
https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/88901
According to the specification in ARM-software/acle#309, this patch adds
the intrinsics that move and zero multiple ZA single-vector groups to
vector registers:
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_inout("za");
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_inout("za");
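
For illustration, a minimal usage sketch of the x2 form (the function
name is hypothetical; it assumes an SME2.1 target and that the caller
has initialised ZA; svget2_s8 is the standard ACLE tuple accessor):

    #include <arm_sme.h>

    // Read and zero two consecutive ZA single-vector groups at 'slice',
    // then extract the first vector of the returned tuple.
    svint8_t readz_first_of_pair(uint32_t slice)
        __arm_streaming __arm_inout("za")
    {
      svint8x2_t pair = svreadz_za8_s8_vg1x2(slice);
      return svget2_s8(pair, 0);
    }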
From 66453f2c8b434ef97676fc27c688402f266f2592 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Tue, 16 Apr 2024 08:50:29 +0000
Subject: [PATCH 1/2] Movaz
---
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 5 +++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa25343450..3b7c2437f8ae94 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -780,7 +780,7 @@ let Predicates = [HasSME2p1] in {
defm MOVAZ_ZMI : sme2p1_movaz_tile_to_vec<"movaz">;
defm MOVAZ_2ZMI : sme2p1_movaz_tile_to_vec_vg2<"movaz">;
defm MOVAZ_4ZMI : sme2p1_movaz_tile_to_vec_vg4<"movaz">;
-defm MOVAZ_VG2_2ZM : sme2_mova_array_to_vec_vg2_multi<0b010, "movaz">;
+defm MOVAZ_VG2_2ZM : sme2_movaz_array_to_vec_vg2_multi<"movaz">;
defm MOVAZ_VG4_4ZM : sme2_mova_array_to_vec_vg4_multi<0b1100, "movaz">;
defm ZERO_MXI : sme2p1_zero_matrix<"zero">;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093cc..42a764350f2989 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -4262,6 +4262,11 @@ multiclass sme2_mova_array_to_vec_vg2_multi<bits<3> opc, string mnemonic> {
}
}
+multiclass sme2_movaz_array_to_vec_vg2_multi<string mnemonic> {
+ defm NAME : sme2_mova_array_to_vec_vg2_multi<0b010, mnemonic>;
+ def NAME # _PSEUDO : sme2_movaz_to_tile_multi_pseudo
+}
+
// move array to vector, four registers
multiclass sme2_mova_array_to_vec_vg4_multi<bits<4> opc, string mnemonic> {
def NAME : sme2_mova_array_to_vec_vg24_multi<opc, ZZZZ_d_mul_r, MatrixOp64,
From 560f36d0afba7e69c3e2d3e96156f7597b776c34 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Tue, 16 Apr 2024 14:04:11 +0000
Subject: [PATCH 2/2] [CLANG][LLVM][AArch64]Add SME2.1 intrinsics for MOVAZ
array to vector
According to the specification in ARM-software/acle#309, this patch adds
the intrinsics that move and zero multiple ZA single-vector groups to
vector registers:
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_inout("za");
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_inout("za");
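
As above, a hedged sketch of the x4 form (hypothetical function name,
same target assumptions; svget4_s8 is the standard ACLE tuple accessor):

    #include <arm_sme.h>

    // Read and zero four consecutive ZA single-vector groups at 'slice',
    // then extract the last vector of the returned tuple.
    svint8_t readz_last_of_quad(uint32_t slice)
        __arm_streaming __arm_inout("za")
    {
      svint8x4_t quad = svreadz_za8_s8_vg1x4(slice);
      return svget4_s8(quad, 3);
    }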
---
clang/include/clang/Basic/arm_sme.td | 12 +
.../acle_sme2p1_movaz.c | 711 ++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +-
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 75 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 26 +
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 22 +-
.../AArch64/sme2p1-intrinsics-movaz.ll | 222 ++++++
9 files changed, 1082 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..128a846ac40dfc 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -674,3 +674,15 @@ let TargetGuard = "sme2" in {
def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}
+
+multiclass ZAReadz<string vg_num>{
+ let TargetGuard = "sme2p1" in {
+ def NAME # _B : SInst<"svreadz_za8_{d}_vg1x" # vg_num, vg_num # "m", "cUc", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+ def NAME # _H : SInst<"svreadz_za16_{d}_vg1x" # vg_num, vg_num # "m", "sUsbh", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+ def NAME # _S : SInst<"svreadz_za32_{d}_vg1x" # vg_num, vg_num # "m", "iUif", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+ def NAME # _D : SInst<"svreadz_za64_{d}_vg1x" # vg_num, vg_num # "m", "lUld", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+ }
+}
+
+defm SVREADZ_VG2 : ZAReadz<"2">;
+defm SVREADZ_VG4 : ZAReadz<"4">;
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
new file mode 100644
index 00000000000000..b7dab63501f303
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
@@ -0,0 +1,711 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+// MOVAZ Array to Vector
+//
+// X2
+//
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_za8_s8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z22test_svreadz_za8_s8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svreadz_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za8_s8_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_za8_u8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z22test_svreadz_za8_u8_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svreadz_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za8_u8_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_za16_s16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z24test_svreadz_za16_s16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
+//
+svint16x2_t test_svreadz_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_s16_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_za16_u16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z24test_svreadz_za16_u16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
+//
+svuint16x2_t test_svreadz_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_u16_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_za32_s32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z24test_svreadz_za32_s32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
+//
+svint32x2_t test_svreadz_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_s32_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_za32_u32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z24test_svreadz_za32_u32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
+//
+svuint32x2_t test_svreadz_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_u32_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_za64_s64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z24test_svreadz_za64_s64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
+//
+svint64x2_t test_svreadz_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_s64_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_za64_u64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z24test_svreadz_za64_u64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
+//
+svuint64x2_t test_svreadz_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_u64_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @test_svreadz_za16_bf16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @_Z25test_svreadz_za16_bf16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP4]]
+//
+svbfloat16x2_t test_svreadz_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_bf16_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x half> @test_svreadz_za16_f16_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x half> @_Z24test_svreadz_za16_f16_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]]
+//
+svfloat16x2_t test_svreadz_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_f16_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x float> @test_svreadz_za32_f32_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x float> @_Z24test_svreadz_za32_f32_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_svreadz_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_f32_vg1x2(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x double> @test_svreadz_za64_f64_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x double> @_Z24test_svreadz_za64_f64_x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]]
+//
+svfloat64x2_t test_svreadz_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_f64_vg1x2(slice);
+}
+
+//
+// X4
+//
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_za8_s8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z22test_svreadz_za8_s8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+svint8x4_t test_svreadz_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za8_s8_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_za8_u8_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z22test_svreadz_za8_u8_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
+//
+svuint8x4_t test_svreadz_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za8_u8_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_za16_s16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z24test_svreadz_za16_s16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
+//
+svint16x4_t test_svreadz_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_s16_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_za16_u16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z24test_svreadz_za16_u16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
+//
+svuint16x4_t test_svreadz_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_u16_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_za32_s32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z24test_svreadz_za32_s32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
+//
+svint32x4_t test_svreadz_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_s32_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_za32_u32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z24test_svreadz_za32_u32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
+//
+svuint32x4_t test_svreadz_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_u32_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_za64_s64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z24test_svreadz_za64_s64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
+//
+svint64x4_t test_svreadz_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_s64_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_za64_u64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z24test_svreadz_za64_u64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
+//
+svuint64x4_t test_svreadz_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_u64_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @test_svreadz_za16_bf16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @_Z25test_svreadz_za16_bf16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP8]]
+//
+svbfloat16x4_t test_svreadz_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_bf16_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 32 x half> @test_svreadz_za16_f16_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 32 x half> @_Z24test_svreadz_za16_f16_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
+// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]]
+//
+svfloat16x4_t test_svreadz_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za16_f16_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x float> @test_svreadz_za32_f32_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x float> @_Z24test_svreadz_za32_f32_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]]
+//
+svfloat32x4_t test_svreadz_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za32_f32_vg1x4(slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x double> @test_svreadz_za64_f64_x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x double> @_Z24test_svreadz_za64_f64_x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
+// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]]
+//
+svfloat64x4_t test_svreadz_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_za64_f64_vg1x4(slice);
+}
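For readers skimming the generated checks above, here is a minimal C usage sketch of the new ACLE intrinsics. It is illustrative only: use_rows is a hypothetical consumer added to keep the example self-contained, and the include assumes the usual SME ACLE header.

#include <arm_sme.h>

// Hypothetical consumer of the four read-and-zeroed vectors.
void use_rows(svfloat32x4_t rows) __arm_streaming_compatible;

void demo(uint32_t slice) __arm_streaming __arm_inout("za") {
  // Reads four ZA single-vector groups at 'slice' into Z registers
  // and zeroes the corresponding ZA storage.
  svfloat32x4_t rows = svreadz_za32_f32_vg1x4(slice);
  use_rows(rows);
}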
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bcaa37de74b630..3980e71491a526 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2839,6 +2839,17 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic;
def int_aarch64_sme_writeq_vert : SME_VectorToTile_Intrinsic;
+
+ def int_aarch64_sme_readz_x2
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_i32_ty],
+ [IntrNoMem, IntrHasSideEffects]>;
+
+ def int_aarch64_sme_readz_x4
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_i32_ty],
+ [IntrNoMem, IntrHasSideEffects]>;
+
def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
class SME_OuterProduct_Intrinsic
@@ -3646,4 +3657,4 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
\ No newline at end of file
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
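As a sketch of how these multi-result intrinsic definitions surface in IR (mirroring the calls in the CHECK lines above, with the overload suffix taken from the result element type):

  %pair = call { <vscale x 4 x float>, <vscale x 4 x float> }
              @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice)
  %lo = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %pair, 0
  %hi = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %pair, 1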
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 51bec3604026b0..88f2e0dacce4b7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -392,7 +392,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
template <unsigned MaxIdx, unsigned Scale>
void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
unsigned Op);
-
+ template <unsigned MaxIdx, unsigned Scale>
+ void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned BaseReg,
+ unsigned Op);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
template <int64_t Min, int64_t Max>
@@ -1985,6 +1987,35 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+template <unsigned MaxIdx, unsigned Scale>
+void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs,
+ unsigned BaseReg,
+ unsigned Op) {
+ SDValue SliceBase = N->getOperand(2);
+
+ SDValue Base, Offset;
+ if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale))
+ return;
+ // The ZA tile operand is attached later, at the MachineInstr level
+ // (see EmitTileMovaz); SelectionDAG cannot select the ZA tile as an
+ // output register alongside the ZReg results.
+ SDLoc DL(N);
+ SDValue Ops[] = {Base, Offset, /*Chain*/ N->getOperand(0)};
+ SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops);
+
+ EVT VT = N->getValueType(0);
+ for (unsigned I = 0; I < NumVecs; ++I)
+ ReplaceUses(SDValue(N, I),
+ CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT,
+ SDValue(Mov, 0)));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N,
unsigned NumOutVecs,
bool IsTupleInput,
@@ -5175,6 +5206,48 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::MOVA_VG4_4ZMXI);
return;
}
+ case Intrinsic::aarch64_sme_readz_x2: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 2, AArch64::ZA,
+ AArch64::MOVAZ_VG2_2ZMXI_B_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 2, AArch64::ZA,
+ AArch64::MOVAZ_VG2_2ZMXI_H_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 2, AArch64::ZA,
+ AArch64::MOVAZ_VG2_2ZMXI_S_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 2, AArch64::ZA,
+ AArch64::MOVAZ_VG2_2ZMXI_D_PSEUDO);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sme_readz_x4: {
+ if (VT == MVT::nxv16i8) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 4, AArch64::ZA,
+ AArch64::MOVAZ_VG4_4ZMXI_B_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ VT == MVT::nxv8bf16) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 4, AArch64::ZA,
+ AArch64::MOVAZ_VG4_4ZMXI_H_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 4, AArch64::ZA,
+ AArch64::MOVAZ_VG4_4ZMXI_S_PSEUDO);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectMultiVectorMoveZ<7, 1>(Node, 4, AArch64::ZA,
+ AArch64::MOVAZ_VG4_4ZMXI_D_PSEUDO);
+ return;
+ }
+ break;
+ }
case Intrinsic::swift_async_context_addr: {
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80181a77c9d238..c5fdcf87744310 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2832,6 +2832,22 @@ AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.add(MI.getOperand(0)); // ZReg
+ MIB.addReg(BaseReg, RegState::Define); // ZA is an output: MOVAZ zeroes it
+ MIB.addReg(BaseReg); // ...and also an input: its contents are read out
+ MIB.add(MI.getOperand(1)); // slice index register
+ MIB.add(MI.getOperand(2)); // slice index offset
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *
AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -2992,6 +3008,16 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitZero(MI, BB);
case AArch64::ZERO_T_PSEUDO:
return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
+ case AArch64::MOVAZ_VG2_2ZMXI_B_PSEUDO:
+ case AArch64::MOVAZ_VG2_2ZMXI_H_PSEUDO:
+ case AArch64::MOVAZ_VG2_2ZMXI_S_PSEUDO:
+ case AArch64::MOVAZ_VG2_2ZMXI_D_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_VG2_2ZMXI, AArch64::ZA, MI, BB);
+ case AArch64::MOVAZ_VG4_4ZMXI_B_PSEUDO:
+ case AArch64::MOVAZ_VG4_4ZMXI_H_PSEUDO:
+ case AArch64::MOVAZ_VG4_4ZMXI_S_PSEUDO:
+ case AArch64::MOVAZ_VG4_4ZMXI_D_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_VG4_4ZMXI, AArch64::ZA, MI, BB);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 18439dc7f01020..8c86b47a701cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -635,6 +635,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
MachineInstr &MI, MachineBasicBlock *BB,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 3b7c2437f8ae94..b0e6dc16390488 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -780,8 +780,8 @@ let Predicates = [HasSME2p1] in {
defm MOVAZ_ZMI : sme2p1_movaz_tile_to_vec<"movaz">;
defm MOVAZ_2ZMI : sme2p1_movaz_tile_to_vec_vg2<"movaz">;
defm MOVAZ_4ZMI : sme2p1_movaz_tile_to_vec_vg4<"movaz">;
-defm MOVAZ_VG2_2ZM : sme2_movaz_array_to_vec_vg2_multi<"movaz">;
-defm MOVAZ_VG4_4ZM : sme2_mova_array_to_vec_vg4_multi<0b1100, "movaz">;
+defm MOVAZ_VG2_2ZMXI : sme2_movaz_array_to_vec_vg2_multi<"movaz">;
+defm MOVAZ_VG4_4ZMXI : sme2_movaz_array_to_vec_vg4_multi<"movaz">;
defm ZERO_MXI : sme2p1_zero_matrix<"zero">;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 42a764350f2989..e565575c8f0e91 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,13 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
let usesCustomInserter = 1;
}
+class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
+ SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -4263,8 +4270,11 @@ multiclass sme2_mova_array_to_vec_vg2_multi<bits<3> opc, string mnemonic> {
}
multiclass sme2_movaz_array_to_vec_vg2_multi<string mnemonic> {
- def NAME : sme2_mova_array_to_vec_vg2_multi<0b010, mnemonic>;
- def NAME # _PSEUDO : sme2_movaz_to_tile_multi_pseudo
+ defm NAME : sme2_mova_array_to_vec_vg2_multi<0b010, mnemonic>;
+ def NAME # _B_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, SMEMatrixArray>;
+ def NAME # _H_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZ_h_mul_r, SMEMatrixArray>;
+ def NAME # _S_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZ_s_mul_r, SMEMatrixArray>;
+ def NAME # _D_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZ_d_mul_r, SMEMatrixArray>;
}
// move array to vector, four registers
@@ -4342,6 +4352,14 @@ multiclass sme2_mova_array_to_vec_vg4_multi<bits<4> opc, string mnemonic> {
}
}
+multiclass sme2_movaz_array_to_vec_vg4_multi<string mnemonic> {
+ defm NAME : sme2_mova_array_to_vec_vg4_multi<0b1100, mnemonic>;
+ def NAME # _B_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, SMEMatrixArray>;
+ def NAME # _H_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZZZ_h_mul_r, SMEMatrixArray>;
+ def NAME # _S_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZZZ_s_mul_r, SMEMatrixArray>;
+ def NAME # _D_PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZZZ_d_mul_r, SMEMatrixArray>;
+}
+
//===----------------------------------------------------------------------===//
// SME2 multi-vec saturating shift right narrow
class sme2_sat_shift_vector_vg2<string mnemonic, bit op, bit u>
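Taken together, each defm above should expand to the real instruction plus one pseudo per element width; the names below are inferred from the NAME # suffix concatenation and match the opcodes referenced in AArch64ISelDAGToDAG.cpp:

  MOVAZ_VG2_2ZMXI           // real instruction, from sme2_mova_array_to_vec_vg2_multi
  MOVAZ_VG2_2ZMXI_B_PSEUDO  // custom-inserted pseudos, one per element size
  MOVAZ_VG2_2ZMXI_H_PSEUDO
  MOVAZ_VG2_2ZMXI_S_PSEUDO
  MOVAZ_VG2_2ZMXI_D_PSEUDO
  // and likewise MOVAZ_VG4_4ZMXI with its _B/_H/_S/_D pseudos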
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
new file mode 100644
index 00000000000000..deda57ead3b282
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s
+
+
+; MOVAZ (array to vector, multi)
+
+
+;;
+; X2
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z8_i8_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice.max)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_i16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice.max)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z32_i32_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice.max)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z64_i64_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice.max)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res2
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_bf16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice.max)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_f16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice.max)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z32_f32_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice.max)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x2(i32 %slice) {
+; CHECK-LABEL: test_readz_z64_f64_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice.max)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>} %res2
+}
+
+;;
+; X4
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z8_i8_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice.max)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res2
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_i16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice.max)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %res2
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z32_i32_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice.max)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res2
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z64_i64_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice.max)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %res2
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_bf16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice.max)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z16_f16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice.max)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z32_f32_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice.max)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res2
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x4(i32 %slice) {
+; CHECK-LABEL: test_readz_z64_f64_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
+; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice.max)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res2
+}
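As a usage note, the new CodeGen test can be exercised on its own with the same invocation as its RUN line, assuming an llc built with this patch:

  llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs \
      < llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll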